%% ================================================================================
%% This LaTeX file was created by AbiWord.
%% AbiWord is a free, Open Source word processor.
%% You may obtain more information about AbiWord at www.abisource.com
%% ================================================================================
\documentclass[12pt]{article}
\usepackage[T1]{fontenc}
\usepackage{calc}
\usepackage{hyperref}
\usepackage{setspace}
\usepackage{multicol}
\usepackage[normalem]{ulem}
\usepackage{color}
\setlength{\oddsidemargin}{1.250000in-1in}
\setlength{\textwidth}{\paperwidth - 1.250000in-1.250000in}
\begin{document}
\begin{slide}{Perl and Regular Expressions}
\begin{center} Regular Expressions are available as part of the programming languages Java, JScript, Visual Basic and VBScript, JavaScript, C, C++, C\#, elisp, Perl, Python, Ruby, PHP, sed, awk, and in many applications, such as editors, grep, egrep. \end{center}
\begin{center} Regular Expressions help you master your data. \end{center}
\end{slide}
\begin{slide}{What is a Regular Expression?}
\begin{itemize}
\item Powerful.
\item Low level description:
\item Describes some text
\item Can use to:
\item Verify a user's input
\item Sift through large amounts of data
\item High level description:
\item Allow you to master your data
\end{itemize}
\end{slide}
\begin{slide}{Regular Expressions as a language}
\begin{itemize}
\item Can consider regular expressions as a language
\item Made of two types of characters:
\item \textit{Literal} characters
\item Normal text characters
\item Like words of the program
\item \textit{Metacharacters}
\item The special characters + ? .
* \^{} \$ ( ) [ \{ | \ensuremath{\backslash}
\item Act as the grammar that combines with the words according to a set of rules to create an expression that communicates an idea
\end{itemize}
\end{slide}
\begin{slide}{How to use a Regular Expression}
\begin{center} How to make a regular expression as part of your program \end{center}
\end{slide}
\begin{slide}{What do they look like?}
\begin{itemize}
\item In Perl, a regular expression begins and ends with \textbf{/}, like this: \textbf{/abc/}
\item \textbf{/abc/} matches the string "abc"
\item Are these literal characters or metacharacters?
\item Returns true if matches, so often use as condition in an if statement
\end{itemize}
\end{slide}
\begin{slide}{Example: searching for "Course:"}
\begin{itemize}
\item Problem: want to print all lines in all input files that contain the string "Course:"
\item while ( <> ) \{
\item my \$line = \$\_;
\item if ( \$line =\~{} /Course:/ ) \{
\item print \$line;
\item \}
\item \}
\item Or more concisely:
\item while ( <> ) \{
\item print if \$\_ =\~{} /Course:/;
\item \}
\end{itemize}
\end{slide}
\begin{slide}{The "match operator" \textbf{=\~{}}}
\begin{itemize}
\item If just use /Course:/, this returns true if \$\_ contains the string Course:
\item If want to test another string variable \$var to see if it contains the regular expression, use
\item \$var =\~{} /regular expression/
\item Under what condition is this true?
\end{itemize}
\end{slide}
\begin{slide}{The "match operator" \textbf{=\~{}} 2}
\begin{itemize}
\item \# sets the string to be searched:
\item \$\_ = "perl for Win32";
\item \# is 'perl' inside \$\_?
\item if ( \$\_ =\~{} /perl/ ) \{ print "Found perl\ensuremath{\backslash}n" \};
\item
\item \# Same as the regex above.
\item \# Don't need the =\~{} as we are testing \$\_:
\item if ( /perl/ ) \{ print "Found perl\ensuremath{\backslash}n" \};
\end{itemize}
\end{slide}
\begin{slide}{\textbf{/i} Matching without case sensitivity}
\begin{itemize}
\item \$\_ = "perl for Win32";
\item \# this will fail because the case doesn't match:
\item if ( /PeRl/ ) \{ print "Found PeRl\ensuremath{\backslash}n" \};
\item \# this will match, because there is an 'er' in 'perl':
\item if ( /er/ ) \{ print "Found er\ensuremath{\backslash}n" \};
\item \# this will match, because there is an 'n3' in 'Win32':
\item if ( /n3/ ) \{ print "Found n3\ensuremath{\backslash}n" \};
\item \# this will fail because the case doesn't match:
\item if ( /win32/ ) \{ print "Found win32\ensuremath{\backslash}n" \};
\item
\item \# This matches because the /i at the end means
\item \# "match without case sensitivity":
\item if ( /win32/i ) \{ print "Found win32 (i)\ensuremath{\backslash}n" \};
\end{itemize}
\end{slide}
\begin{slide}{Using \textbf{!\~{}} instead of \textbf{=\~{}}}
\begin{itemize}
\item \# Looking for a space:
\item print "Found!\ensuremath{\backslash}n" if / /;
\item \# both these are the same, but reversing the logic with
\item \# unless and !\~{}
\item print "Found!!\ensuremath{\backslash}n" unless \$\_ !\~{} / /;
\item print "Found!!\ensuremath{\backslash}n" unless !
/ /;
\end{itemize}
\end{slide}
\begin{slide}{Embedding variables in regexps}
\begin{itemize}
\item \# Create two variables containing regular expressions
\item \# to search for:
\item my \$find = 32;
\item my \$find2 = " for ";
\item
\item if ( /\$find/ ) \{ print "Found '\$find'\ensuremath{\backslash}n" \};
\item if ( /\$find2/ ) \{ print "Found '\$find2'\ensuremath{\backslash}n" \};
\item
\item \# different way to do the above:
\item print "Found \$find2\ensuremath{\backslash}n" if /\$find2/;
\end{itemize}
\end{slide}
\begin{slide}{The Metacharacters}
\begin{center} The funny characters \end{center}
\begin{center} What they do \end{center}
\begin{center} How to use them \end{center}
\end{slide}
\begin{slide}{Character Classes \textbf{[...]}}
\begin{itemize}
\item my @names = ( "Nick", "Albert", "Alex", "Pick" );
\item foreach my \$name ( @names ) \{
\item if ( \$name =\~{} /[NP]ick/ ) \{
\item print "\$name: Out for a Pick Nick\ensuremath{\backslash}n";
\item \} else \{
\item print "\$name is not Pick or Nick\ensuremath{\backslash}n";
\item \}
\item \}
\item Square brackets \textit{match a single character}
\end{itemize}
\end{slide}
\begin{slide}{Examples of use of \textbf{[...]}}
\begin{itemize}
\item Match a capital letter: [ABCDEFGHIJKLMNOPQRSTUVWXYZ]
\item Same thing: [A-Z]
\item Match a vowel: [aeiou]
\item Match a letter or digit: [A-Za-z0-9]
\end{itemize}
\end{slide}
\begin{slide}{Negated character class: \textbf{[\^{}...]}}
\begin{itemize}
\item Match any single character that is \textit{not} a letter: [\^{}A-Za-z]
\item Match any character that is not a space or a tab: [\^{} \ensuremath{\backslash}t]
\end{itemize}
\end{slide}
\begin{slide}{Example using \textbf{[\^{}...]}}
\begin{itemize}
\item This simple program prints only lines that contain characters that are not a space:
\item while ( <> )
\item \{
\item print \$\_ if /[\^{} ]/;
\item \}
\item
This prints lines that \textit{start with} a character that is not a space:
\item while ( <> )
\item \{
\item print \$\_ if /\^{}[\^{} ]/;
\item \}
\item Notice that \textbf{\^{}} has two meanings: one inside \textbf{[...]}, the other outside.
\end{itemize}
\end{slide}
\begin{slide}{Shorthand for Common Character Classes}
\begin{itemize}
\item Since matching a digit is very common, Perl provides \textbf{\ensuremath{\backslash}d} as a short way of writing \\\textbf{[0-9]}
\item \textbf{\ensuremath{\backslash}D} matches a non-digit: \textbf{[\^{}0-9]}
\item \textbf{\ensuremath{\backslash}s} matches any whitespace character; shorthand for \textbf{[ \ensuremath{\backslash}t\ensuremath{\backslash}n\ensuremath{\backslash}r\ensuremath{\backslash}f]}
\item \textbf{\ensuremath{\backslash}S} non-whitespace, \textbf{[\^{} \ensuremath{\backslash}t\ensuremath{\backslash}n\ensuremath{\backslash}r\ensuremath{\backslash}f]}
\item \textbf{\ensuremath{\backslash}w} word character, \textbf{[a-zA-Z0-9\_]}
\item \textbf{\ensuremath{\backslash}W} non-word character, \textbf{[\^{}a-zA-Z0-9\_]}
\end{itemize}
\end{slide}
\begin{slide}{Matching any character}
\begin{itemize}
\item The dot matches any character except a newline
\item This matches any line with at least 5 characters:
\item print if /...../;
\end{itemize}
\end{slide}
\begin{slide}{Matching the beginning or end}
\begin{itemize}
\item to match a line that contains exactly five characters:
\item print if /\^{}.....\$/;
\item the \^{} matches the beginning of the line.
\item the \$ matches at the end of the line
\end{itemize}
\end{slide}
\begin{slide}{Matching Repetitions: \textbf{* + ? \{n,m\}}}
\begin{itemize}
\item To match zero or more:
\item /a*/ will match zero or more letter a, so matches "", "a", "aaaa", "qwereqwqwer", or the nothing in front of \textit{anything}!
\item to match at least one:
\item /a+/ matches at least one "a"
\item /a?/ matches zero or one "a"
\item /a\{3,5\}/ matches between 3 and 5 "a"s.
\end{itemize}
\end{slide}
\begin{slide}{Example using .*}
\begin{itemize}
% NOTE(review): the address in angle brackets after the name appears to have
% been lost in conversion; the <...> examples need it restored to match.
\item \$\_ = 'Nick Urbanik ';
\item print "found something in <>\ensuremath{\backslash}n" if /<.*>/;
\item
\item \# Find everything between quotes:
\item \$\_ = 'He said, "Hi there!", and then "What\ensuremath{\backslash}'s up?"';
\item print "quoted!\ensuremath{\backslash}n" if /"[\^{}"]*"/;
\item print "too much!\ensuremath{\backslash}n" if /".*"/;
\end{itemize}
\end{slide}
\begin{slide}{Capturing the Match with \textbf{(...)}}
\begin{itemize}
\item Often want to scan large amounts of data, extracting important items
\item Use parentheses and regular expressions
\item Silly example of capturing an email address:
\item \$\_ = 'Nick Urbanik ';
\item print "found \$1 in <>\ensuremath{\backslash}n" if /<(.*)>/;
\end{itemize}
\end{slide}
\begin{slide}{Capturing the match: greediness}
\begin{itemize}
\item Look at this example:
\item \$\_ = 'He said, "Hi there!", and then "What\ensuremath{\backslash}'s up?"';
\item print "\$1\ensuremath{\backslash}n" if /"([\^{}"]*)"/;
\item print "\$1\ensuremath{\backslash}n" if /"(.*)"/;
\item What will each print?
\item The first one works; the second one prints:\\Hi there!", and then "What's up?
\item Why?
\item Because *, ?, +, \{n,m\} are \textit{greedy}!
\item They match as much as they possibly can!
\end{itemize}
\end{slide}
\begin{slide}{Being Stingy (not Greedy): \textbf{?}}
\begin{itemize}
\item Usually greedy matching is what we want, but not always
\item How can we match as little as possible?
\item Put a ? after the quantifier:
\item *? {Match 0 or more times}
\item +? {Match 1 or more times}
\item ?? {Match 0 or 1 time}
\item \{n,\}? {Match at least n times}
\item \{n,m\}?
{Match at least n, but no more than m times}
\end{itemize}
\end{slide}
\begin{slide}{Being Less Greedy: Example}
\begin{itemize}
\item We can solve the problem we saw earlier using non-greedy matching:
\item \$\_ = 'He said, "Hi there!", and then "What\ensuremath{\backslash}'s up?"';
\item print "\$1\ensuremath{\backslash}n" if /"([\^{}"]*)"/;
\item print "\$1\ensuremath{\backslash}n" if /"(.*?)"/;
\item These both work, and match only\\Hi there!
\end{itemize}
\end{slide}
\begin{slide}{Sifting through large amounts of data}
\begin{itemize}
\item Imagine you need to create computing accounts for thousands of students
\item As input, you have data of the form:
\item Some heading on the top of each page
\item More headings with other content, including blank lines
\item A tab character separates the columns
\item 123456789 H123456(1)
\item 234567890 I234567(2)
\item 345678901 J345678(3)
\item ... ...
\item 987654321 A123456(1)
\end{itemize}
\end{slide}
\begin{slide}{Capturing the Match: \textbf{(...)}}
\begin{itemize}
\item \# useradd() is a function defined elsewhere
\item \# that creates a computer account with
\item \# username as first parameter, password as
\item \# the second parameter
\item
\item while ( <> ) \{
\item if ( /\^{}(\ensuremath{\backslash}d\{9\})\ensuremath{\backslash}t([A-Z]\ensuremath{\backslash}d\{6\}\ensuremath{\backslash}([\ensuremath{\backslash}dA]\ensuremath{\backslash}))/ ) \{
\item my \$student\_id = \$1;
\item my \$hk\_id = \$2;
\item useradd( \$student\_id, \$hk\_id );
\item \}
\item \}
\end{itemize}
\end{slide}
\begin{slide}{The Substitution Operator \textbf{s///}}
\begin{itemize}
\item Sometimes want to replace one string with another (editing)
\item Example: want to replace Nicholas with Nick on input files:
\item \textbf{while ( <> )}
\item \textbf{\{}
\item \textbf{ \$\_ =\~{} s/Nicholas/Nick/;}
\item \textbf{ print \$\_;}
\item \textbf{\}}
\end{itemize}
\end{slide}
\begin{slide}{Avoiding leaning toothpicks: 
\textbf{/\ensuremath{\backslash}/\ensuremath{\backslash}/}}
\begin{itemize}
\item Want to change a filename, edit the directory in the path from, say /usr/local/bin/filename to /usr/bin/filename
\item Could do like this:
\item s/\ensuremath{\backslash}/usr\ensuremath{\backslash}/local\ensuremath{\backslash}/bin\ensuremath{\backslash}//\ensuremath{\backslash}/usr\ensuremath{\backslash}/bin\ensuremath{\backslash}//;
\item but this makes me dizzy!
\item We can do this instead:
\item s!/usr/local/bin/!/usr/bin/!;
\item Can use any character instead of \textbf{/} in \textbf{s///}
\item For \textit{matches}, can put m//, and use any char instead of /
\item Can also use parentheses or braces:
\item \textbf{s\{...\}\{...\}} or \textbf{m\{...\}}
\end{itemize}
\end{slide}
\begin{slide}{Substitution and the \textbf{/g} modifier}
\begin{itemize}
\item If an input line contains:
\item Nicholas Urbanik read "Nicholas Nickleby"
\item then the output is:
\item Nick Urbanik read "Nicholas Nickleby"
\item How to change all the Nicholas in one line?
\item Use the /g (global) modifier:
\item while ( <> )
\item \{
\item \$\_ =\~{} s/Nicholas/Nick/g;
\item print \$\_;
\item \}
\end{itemize}
\end{slide}
\begin{slide}{Making regular expressions readable: /x modifier}
\begin{itemize}
\item Sometimes regular expressions can get long, and need comments inside so others (or you later!) understand
\item Use /x at the end of s///x or m//x
\item Allows white space, newlines, comments
\end{itemize}
\end{slide}
\end{document}