%% ================================================================================
%% This LaTeX file was created by AbiWord.                                         
%% AbiWord is a free, Open Source word processor.                                  
%% You may obtain more information about AbiWord at www.abisource.com              
%% ================================================================================
\documentclass[12pt]{article}
\usepackage[T1]{fontenc}
\usepackage{calc}
\usepackage{hyperref}\usepackage{setspace}
\usepackage{multicol}
\usepackage[normalem]{ulem}
\usepackage{color}
\setlength{\oddsidemargin}{1.250000in-1in}
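\setlength{\textwidth}{\paperwidth - 1.250000in-1.250000in}
% The body uses \begin{slide}{<title>}, which the article class does not
% provide; define it as an unnumbered heading followed by a page break.
\newenvironment{slide}[1]{\section*{#1}}{\clearpage}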
\begin{document}
\begin{slide}{Perl and Regular Expressions}
\begin{itemize}
\begin{center}
Regular Expressions are available as part of the programming languages Java, JScript, Visual Basic and VBScript, JavaScript, C, C++, C\#, elisp, Perl, Python, Ruby, PHP, sed, awk, and in many applications, such as editors, grep, egrep.
\end{center}
\begin{center}
Regular Expressions help you master your data.
\end{center}
\end{itemize}
\end{slide}
\begin{slide}{What is a Regular Expression?}
\begin{itemize}
\item Powerful.
\item Low level description:
\item Describes some text
\item Can use to:
\item Verify a user's input
\item Sift through large amounts of data
\item High level description:
\item Allow you to master your data
\end{itemize}
\end{slide}
\begin{slide}{Regular Expressions as a language}
\begin{itemize}
\item Can consider regular expressions as a language
\item Made of two types of characters:
\item \textit{Literal} characters
\item Normal text characters
\item Like words of the program
\item \textit{Metacharacters}
\item The special characters \textbf{+ ? . * \^{} \$ ( ) [ \{ | \ensuremath{\backslash}}
\item Act as the grammar that combines with the words according to a set of rules to create an expression that communicates an idea
\end{itemize}
\end{slide}
\begin{slide}{How to use a Regular Expression}
\begin{itemize}
\begin{center}
How to make a regular expression as part of your program
\end{center}
\end{itemize}
\end{slide}
\begin{slide}{What do they look like?}
\begin{itemize}
\item In Perl, a regular expression begins and ends with \textbf{/}, like this: \textbf{/abc/}
\item \textbf{/abc/} matches the string "abc"
\item Are these literal characters or metacharacters?
\item Returns true if matches, so often use as condition in an if statement
\end{itemize}
\end{slide}
\begin{slide}{Example: searching for "Course:"}
\begin{itemize}
\item Problem: want to print all lines in all input files that contain the string "Course:"
\item while ( <> ) \{
\item     my \$line = \$\_;
\item     if ( \$line =\~{} /Course:/ ) \{
\item         print \$line;
\item     \}
\item \}
\item Or more concisely:
\item while ( <> ) \{
\item     print if \$\_ =\~{} /Course:/;
\item \}
\end{itemize}
\end{slide}
\begin{slide}{The "match operator" \textbf{=\~{}}}
\begin{itemize}
\item If just use /Course:/, this returns true if \$\_ contains the string Course:
\item If want to test another string variable \textbf{\$var} to see if it contains the regular expression, use
\item \$var =\~{} /regular expression/
\item Under what condition is this true?
\end{itemize}
\end{slide}
\begin{slide}{The "match operator" \textbf{=\~{}} 2}
\begin{itemize}
\item \# sets the string to be searched:
\item \$\_ = "perl for Win32";
\item \# is 'perl' inside \$\_? 
\item if ( \$\_ =\~{} /perl/ ) \{ print "Found perl\ensuremath{\backslash}n" \};
\item \item \# Same as the regex above.
\item \# Don't need the =\~{} as we are testing \$\_:
\item if ( /perl/ )     \{ print "Found perl\ensuremath{\backslash}n" \};
\end{itemize}
\end{slide}
\begin{slide}{\textbf{/i} Matching without case sensitivity}
\begin{itemize}
\item \$\_ = "perl for Win32";
\item \# this will fail because the case doesn't match:
\item if ( /PeRl/ )     \{ print "Found PeRl\ensuremath{\backslash}n" \};
\item \# this will match, because there is an 'er' in 'perl':
\item if ( /er/ )       \{ print "Found er\ensuremath{\backslash}n" \};
\item \# this will match, because there is an 'n3' in 'Win32':
\item if ( /n3/ )       \{ print "Found n3\ensuremath{\backslash}n" \};
\item \# this will fail because the case doesn't match:
\item if ( /win32/ )    \{ print "Found win32\ensuremath{\backslash}n" \};
\item \item \# This matches because the /i at the end means
\item \# "match without case sensitivity":
\item if ( /win32/i )   \{ print "Found win32 (i)\ensuremath{\backslash}n" \};
\end{itemize}
\end{slide}
\begin{slide}{Using \textbf{!\~{}} instead of \textbf{=\~{}}}
\begin{itemize}
\item \# Looking for a space:
\item print "Found!\ensuremath{\backslash}n"  if      / /;
\item \# both these are the same, but reversing the logic with
\item \# unless and !\~{}
\item print "Found!!\ensuremath{\backslash}n" unless \$\_ !\~{} / /;
\item print "Found!!\ensuremath{\backslash}n" unless    ! / /;
\end{itemize}
\end{slide}
\begin{slide}{Embedding variables in regexps}
\begin{itemize}
\item \# Create two variables containing regular expressions
\item \# to search for:
\item my \$find = 32;
\item my \$find2 = " for ";
\item \item if ( /\$find/ )  \{ print "Found '\$find'\ensuremath{\backslash}n" \};
\item if ( /\$find2/ ) \{ print "Found '\$find2'\ensuremath{\backslash}n" \};
\item \item \# different way to do the above:
\item print "Found \$find2\ensuremath{\backslash}n" if /\$find2/; 
\end{itemize}
\end{slide}
\begin{slide}{The Metacharacters}
\begin{itemize}
\begin{center}
The funny characters
\end{center}
\begin{center}
What they do
\end{center}
\begin{center}
How to use them
\end{center}
\end{itemize}
\end{slide}
\begin{slide}{Character Classes \textbf{[...]}}
\begin{itemize}
\item my @names = ( "Nick", "Albert", "Alex", "Pick" );
\item foreach my \$name ( @names ) \{
\item     if ( \$name =\~{} /[NP]ick/ ) \{
\item         print "\$name: Out for a Pick Nick\ensuremath{\backslash}n";
\item     \} else \{
\item         print "\$name is not Pick or Nick\ensuremath{\backslash}n";
\item     \}
\item \}
\item Square brackets \textit{match a single character}
\end{itemize}
\end{slide}
\begin{slide}{Examples of use of \textbf{[...]}}
\begin{itemize}
\item Match a capital letter: [ABCDEFGHIJKLMNOPQRSTUVWXYZ]
\item Same thing: [A-Z]
\item Match a vowel: [aeiou]
\item Match a letter or digit: [A-Za-z0-9]
\end{itemize}
\end{slide}
\begin{slide}{Negated character class: \textbf{[\^{}...]}}
\begin{itemize}
\item Match any single character that is \textit{not} a letter: \textbf{[\^{}A-Za-z]}
\item Match any character that is not a space or a tab: \textbf{[\^{} \ensuremath{\backslash}t]}
\end{itemize}
\end{slide}
\begin{slide}{Example using \textbf{[\^{}...]}}
\begin{itemize}
\item This simple program prints only lines that contain characters that are not a space:
\item while ( <> )
\item \{
\item     print \$\_ if /[\^{} ]/;
\item \}
\item This prints lines that \textit{start with} a character that is not a space:
\item while ( <> )
\item \{
\item     print \$\_ if /\^{}[\^{} ]/;
\item \}
\item Notice that \textbf{\^{}} has two meanings: one inside \textbf{[...]}, the other outside.
\end{itemize}
\end{slide}
\begin{slide}{Shorthand for Common Character Classes}
\begin{itemize}
\item Since matching a digit is very common, Perl provides \textbf{\ensuremath{\backslash}d} as a short way of writing \\\textbf{[0-9]}
\item \textbf{\ensuremath{\backslash}D} matches a non-digit: \textbf{[\^{}0-9]}
\item \textbf{\ensuremath{\backslash}s} matches any whitespace character; shorthand for \textbf{[ \ensuremath{\backslash}t\ensuremath{\backslash}n\ensuremath{\backslash}r\ensuremath{\backslash}f]}
\item \textbf{\ensuremath{\backslash}S} non-whitespace, \textbf{[\^{} \ensuremath{\backslash}t\ensuremath{\backslash}n\ensuremath{\backslash}r\ensuremath{\backslash}f]}
\item \textbf{\ensuremath{\backslash}w} word character, \textbf{[a-zA-Z0-9\_]}
\item \textbf{\ensuremath{\backslash}W} non-word character, \textbf{[\^{}a-zA-Z0-9\_]}
\end{itemize}
\end{slide}
\begin{slide}{Matching any character}
\begin{itemize}
\item The dot matches any character except a newline
\item This matches any line with at least 5 characters:
\item print if /...../;
\end{itemize}
\end{slide}
\begin{slide}{Matching the beginning or end}
\begin{itemize}
\item to match a line that contains exactly five characters:
\item print if /\^{}.....\$/;
\item the \^{} matches the beginning of the line.
\item the \$ matches at the end of the line
\end{itemize}
\end{slide}
\begin{slide}{Matching Repetitions: \textbf{* + ? \{n,m\}}}
\begin{itemize}
\item To match zero or more:
\item /a*/ will match zero or more letter a, so matches "", "a", "aaaa", "qwereqwqwer", or the nothing in front of \textit{anything}!
\item to match at least one:
\item /a+/ matches at least one "a"
\item /a?/ matches zero or one "a"
\item /a\{3,5\}/ matches between 3 and 5 "a"s.
\end{itemize}
\end{slide}
\begin{slide}{Example using .*}
\begin{itemize}
\item \$\_ = 'Nick Urbanik <nicku@nicku.org>';
\item print "found something in <>\ensuremath{\backslash}n" if /<.*>/;
\item \item \# Find everything between quotes:
\item \$\_ = 'He said, "Hi there!", and then "What\ensuremath{\backslash}'s up?"';
\item print "quoted!\ensuremath{\backslash}n" if /"[\^{}"]*"/;
\item print "too much!\ensuremath{\backslash}n" if /".*"/;
\end{itemize}
\end{slide}
\begin{slide}{Capturing the Match with \textbf{(...)}}
\begin{itemize}
\item Often want to scan large amounts of data, extracting important items
\item Use parentheses and regular expressions
\item Silly example of capturing an email address:
\item \$\_ = 'Nick Urbanik <nicku@nicku.org>';
\item print "found \$1 in <>\ensuremath{\backslash}n" if /<(.*)>/;
\end{itemize}
\end{slide}
\begin{slide}{Capturing the match: greediness}
\begin{itemize}
\item Look at this example:
\item \$\_ = 'He said, "Hi there!", and then "What\ensuremath{\backslash}'s up?"';
\item print "\$1\ensuremath{\backslash}n" if /"([\^{}"]*)"/;
\item print "\$1\ensuremath{\backslash}n" if /"(.*)"/;
\item What will each print?
\item The first one works; the second one prints:\\Hi there!", and then "What's up?
\item Why?
\item Because *, ?, +, \{m,n\} are \textit{greedy}!
\item They match as much as they possibly can!
\end{itemize}
\end{slide}
\begin{slide}{Being Stingy (not Greedy): \textbf{?}}
\begin{itemize}
\item Usually greedy matching is what we want, but not always
\item How can we match as little as possible?
\item Put a ? after the quantifier:
\item *?		 {Match 0 or more times}
\item +?		 {Match 1 or more times}
\item ?? 	 {Match 0 or 1 time}
\item \{n,\}?	 {Match at least n times}
\item \{n,m\}? {Match at least n, but no more than m times}
\end{itemize}
\end{slide}
\begin{slide}{Being Less Greedy: Example}
\begin{itemize}
\item We can solve the problem we saw earlier using non-greedy matching:
\item \$\_ = 'He said, "Hi there!", and then "What\ensuremath{\backslash}'s up?"';
\item print "\$1\ensuremath{\backslash}n" if /"([\^{}"]*)"/;
\item print "\$1\ensuremath{\backslash}n" if /"(.*?)"/;
\item These both work, and match only\\Hi there!
\end{itemize}
\end{slide}
\begin{slide}{Sifting through large amounts of data}
\begin{itemize}
\item Imagine you need to create computing accounts for thousands of students
\item As input, you have data of the form:
\item Some heading on the top of each page
\item More headings with other content, including blank lines
\item A tab character separates the columns
\item 123456789	H123456(1)
\item 234567890	I234567(2)
\item 345678901	J345678(3)
\item ...		...
\item 987654321	A123456(1)
\end{itemize}
\end{slide}
\begin{slide}{Capturing the Match: \textbf{(...)}}
\begin{itemize}
\item \# useradd() is a function defined elsewhere
\item \# that creates a computer account with
\item \# username as first parameter, password as
\item \# the second parameter
\item \item while ( <> ) \{
\item     if ( /\^{}(\ensuremath{\backslash}d\{9\})\ensuremath{\backslash}t([A-Z]\ensuremath{\backslash}d\{6\}\ensuremath{\backslash}([\ensuremath{\backslash}dA]\ensuremath{\backslash}))/ ) \{
\item         my \$student\_id = \$1;
\item         my \$hk\_id = \$2;
\item         useradd( \$student\_id, \$hk\_id );
\item     \}
\item \}
\end{itemize}
\end{slide}
\begin{slide}{The Substitution Operator \textbf{s///}}
\begin{itemize}
\item Sometimes want to replace one string with another (editing)
\item Example: want to replace Nicholas with Nick on input files:
\item \textbf{while ( <> )}
\item \textbf{\{}
\item \textbf{    \$\_ =\~{} s/Nicholas/Nick/;}
\item \textbf{    print \$\_;}
\item \textbf{\}}
\end{itemize}
\end{slide}
\begin{slide}{Avoiding leaning toothpicks: \textbf{/\ensuremath{\backslash}/\ensuremath{\backslash}/}}
\begin{itemize}
\item Want to change a filename, edit the directory in the path from, say /usr/local/bin/filename to /usr/bin/filename
\item Could do like this:
\item s/\ensuremath{\backslash}/usr\ensuremath{\backslash}/local\ensuremath{\backslash}/bin\ensuremath{\backslash}//\ensuremath{\backslash}/usr\ensuremath{\backslash}/bin\ensuremath{\backslash}//;
\item but this makes me dizzy!
\item We can do this instead:
\item s!\ensuremath{\backslash}/usr/local/bin/!/usr/bin/!;
\item Can use any character instead of \textbf{/} in \textbf{s///}
\item For \textit{matches}, can put \textbf{m//}, and use any char instead of /
\item Can also use parentheses or braces:
\item \textbf{s\{...\}\{...\}} or \textbf{m\{...\}}
\end{itemize}
\end{slide}
\begin{slide}{Substitution and the \textbf{/g} modifier}
\begin{itemize}
\item If an input line contains:
\item Nicholas Urbanik read "Nicholas Nickleby"
\item then the output is:
\item Nick Urbanik read "Nicholas Nickleby"
\item How to change every Nicholas in one line?
\item Use the \textbf{/g} (global) modifier:
\item while ( <> )
\item \{
\item     \$\_ =\~{} s/Nicholas/Nick/g;
\item     print \$\_;
\item \}
\end{itemize}
\end{slide}
\begin{slide}{Making regular expressions readable: /x modifier}
\begin{itemize}
\item Sometimes regular expressions can get long, and need comments inside so others (or you later!) understand
\item Use /x at the end of s///x or m//x
\item Allows white space, newlines, comments
\end{itemize}
\end{slide}
\end{document}