\documentclass{ictlab} \RCS $Revision: 1.3 $ \usepackage{alltt,key,biganswerbox} %\usepackage[hang,bf,nooneline]{caption2} \usepackage[flushleft]{caption2} \ifx\pdftexversion\undefined \usepackage{url} \else \usepackage[pdfpagemode=None,pdfauthor={Nick Urbanik}]{hyperref} \fi \newcommand*{\screenwidth}{0.4\textwidth} \newcommand*{\labTitle}{Regular Expressions, \texttt{sed} and \texttt{awk}} \renewcommand*{\subject}{Operating Systems and Systems Integration} \renewcommand*{\bs}{\texttt{\char '134}} % Backslash `\' \providecommand*{\DHCP}{\acro{DHCP}\xspace} \newlength{\chainsawwidth} \setlength{\chainsawwidth}{3cm} \includeversion{questionsonly} \begin{solution} \excludeversion{questionsonly} \end{solution} \begin{document} \section{Background} \label{sec:background} For the background information you need to answer these questions, please refer to the shell programming lecture notes. The program \texttt{egrep} stands for \emph{extended} \texttt{grep}. It supports so-called \emph{extended regular expressions}. Well, since I have taught you the syntax of extended regular expressions in the lecture, I suggest that you use \texttt{egrep} rather than \texttt{grep} with regular expressions. Both \texttt{grep} and \texttt{egrep} support an option \texttt{-o} to print \emph{only} the part of the line that matches the expression. The \GNU \texttt{sed} program supports the \texttt{-r} option that tells \texttt{sed} to use regular expressions like \texttt{egrep}. So when you use \texttt{sed}, use it with \texttt{sed~-r}. The \GNU \texttt{awk} program has an option \texttt{--posix} that makes \texttt{awk} behave like \texttt{egrep} so that it understands the use of \{$n$\}, \{$n$\texttt{,}$m$\} without having to put in extra backslashes `\bsnb'. \begin{explanation} You might wonder what the difference is between ``extended regular expressions'' and the regular expressions that \texttt{grep} uses.
The difference is explained at the end of the \texttt{info} page: \begin{alltt} $ \textbf{info '(grep)Regular Expressions'} \end{alltt}%$ where it says, ``In basic regular expressions the metacharacters `?', `+', `\{', `\textbar', `(', and `)' lose their special meaning; instead use the backslashed versions `\bsnb?', `\bsnb+', `\bsnb\{', `\bsnb\textbar', `\bsnb(', and `\bsnb)'.'' \end{explanation} \section{Questions} \label{sec:questions} \subsection{\texttt{egrep}ping through the dictionary} \label{sec:dictionary} Your dictionary is a file \path{/usr/share/dict/words}. Use \texttt{egrep} to: \begin{enumerate} \item Find all words containing three letter `a's. \begin{solution} \begin{alltt} $ \texttt{egrep 'a.*a.*a' /usr/share/dict/words} \end{alltt}%$ \end{solution} \item Find all words containing \emph{no} vowels. (A \emph{vowel} is one of the letters `a', `e', `i', `o' and `u'.) \begin{solution} \begin{alltt} $ \texttt{LANG=C egrep -i '^[^aeiou]+$' /usr/share/dict/words} \end{alltt} Note that \texttt{LANG=C} means, ``Don't use Unicode, use just plain old \acro{ASCII}.'' \end{solution} \item Find all words containing \emph{at least} 5 vowels. Count the number of matching words. \begin{solution} \begin{alltt} $ \texttt{egrep '[aeiou][^aeiou]*[aeiou][^aeiou]*'\bs '[aeiou][^aeiou]*[aeiou][^aeiou]*[aeiou]' \bs /usr/share/dict/words | wc -l} 5306 \end{alltt}%$ \end{solution} \item Find all words containing \emph{exactly} 5 vowels. Count the number of matching words. 
\begin{solution} \begin{alltt} $ \texttt{egrep '^[^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]*'\bs '[aeiou][^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]*$' \bs /usr/share/dict/words | wc -l} 3798 \end{alltt} \end{solution} \end{enumerate} \subsection{\texttt{egrep}: Selecting data from student records} \label{sec:student-records} \newlength{\boxht} \setlength{\boxht}{9mm} \begin{enumerate} \item \label{que:get-artificial-student-data}Save the file %\sloppypar \url{http://ictlab.tyict.vtc.edu.hk/snm/lab/regular-expressions/artificial-student-data.txt} %\url{http://ictlab.tyict.vtc.edu.hk/ossi/lab/regex-sed-awk/artificial-student-data-2004.txt} to your local directory. For this data, write a regular expression that will select each of the following. Test it on the data using \texttt{egrep~-o}. \item student number \begin{biganswerbox}[\boxht]% \begin{solution}% \begin{alltt} $ \textbf{egrep -o '[0-9]\{9\}' artificial-student-data.txt} \end{alltt}%$ \end{solution} \end{biganswerbox} \item \label{que:hkid}Hong Kong ID. Count the number of Hong Kong \ID{}s. \begin{biganswerbox}[\boxht]% \begin{solution}% \begin{alltt} $ \textbf{egrep -oi '[A-Z][0-9]\{6\}\bs([A0-9]\bs)' \bs artificial-student-data.txt | wc -l} 560 \end{alltt}%$ \end{solution} \end{biganswerbox} %% \item The class of a particular student %% \begin{biganswerbox}[\boxht]% %% \begin{solution}% %% \mbox{} %% \end{solution} %% \end{biganswerbox} \item the course code. Count the number of courses. \par\vspace*{-1.5ex}\par \begin{explanation} The course and year are shown in this case on the sixth line: \texttt{2241/2}. The course is 2241; this is the second year of study.
\end{explanation} \begin{biganswerbox}[\boxht]% \begin{solution}% \begin{alltt} $ \textbf{egrep -o '^[0-9]\{4\}' artificial-student-data.txt} $ \textbf{egrep -o '^[0-9]\{4\}' artificial-student-data.txt \bs | sort -n | uniq -c} 3 2241 4 2245 1 2262 4 2545 3 2565 \end{alltt} \end{solution} \end{biganswerbox} \item the year of study \begin{biganswerbox}[\boxht]% \begin{solution}% \begin{alltt} $ \textbf{egrep -o '^[0-9]\{4\}/[0-9]' artificial-student-data.txt \bs | sed 's!.*/!!'} \end{alltt}%$ \end{solution} \end{biganswerbox} \item The company the student works for \begin{biganswerbox}[\boxht]% \begin{solution}% Here is one possible solution: % egrep '[A-Z][0-9]{6}\([A0-9]\)' artificial-student-data.txt \ % | cut -b80-125 | grep -v '^ *$' | sed 's/ *$//' \begin{alltt} $ \textbf{egrep '[A-Z][0-9]\{6\}\bs([A0-9]\bs)' artificial-student-data.txt \bs | cut -b80-125 \bs | grep -v '^ *$' \bs | sed 's/ *$//'} \end{alltt}%$ This works by: \begin{itemize} \item listing all lines containing a \acro{HKID}; \item selecting only the columns containing the Company name, with \texttt{cut}, i.e., columns 80 to 125; \item removing blank lines (student records with no company); \item removing the trailing spaces from the end of each line. \end{itemize} There are many other methods.
\end{solution} \end{biganswerbox} \item The home telephone number \begin{biganswerbox}[\boxht]% \begin{solution}% This one just finds the 8-digit number that comes after the Hong Kong \ID{}: \begin{alltt} $ \textbf{egrep '[A-Z][0-9]\{6\}\bs([A0-9]\bs) +[0-9]\{8\}' \bs artificial-student-data.txt \bs | awk '\{print $2\}'} \end{alltt} \end{solution} \end{biganswerbox} \item The gender of the student \begin{biganswerbox}[\boxht]% \begin{solution}% We can find the M or F just before a nine-digit number: \begin{alltt} $ \textbf{egrep -o '[MF] +[0-9]\{9\}' artificial-student-data.txt \bs | awk '\{print $1\}'} $ \textbf{egrep -o '[MF] +[0-9]\{9\}' artificial-student-data.txt \bs | awk '\{print $1\}' \bs | sort \bs | uniq -c} 66 F 494 M \end{alltt} The second pipeline counts the number of males and the number of females. \end{solution} \end{biganswerbox} \item The student's name \begin{biganswerbox}[\boxht]% \begin{solution}% \begin{alltt} $ \textbf{egrep '[A-Z][0-9]\{6\}\bs([A0-9]\bs)' artificial-student-data.txt \bs | cut -b6-42 \bs | sed 's/ *$//'} \end{alltt} This solution uses the same approach as the pipeline that prints the company names. \end{solution} \end{biganswerbox} \label{que:last} \end{enumerate} \subsection{Using \texttt{sed}} \label{sec:sed} Write a \texttt{sed} expression to output \emph{only} the data for which you wrote \emph{each} of the regular expressions above. For example, write a \texttt{sed} command that will print \emph{only} the HK \ID{}s and \emph{all} the HK \ID{}s from the file, using the regular expression you wrote for question~\vref{que:hkid}. You should write eight \texttt{sed} expressions. \begin{enumerate} \stepcounter{enumi} \item student number \begin{solution} \par\medskip\par \begin{alltt} $ \textbf{sed -rn 's/.*([0-9]\{9\}).*/\bs1/p' artificial-student-data.txt} \end{alltt}%$ \end{solution} \item Hong Kong ID.
\begin{solution} \par\medskip\par \begin{alltt} $ \textbf{sed -rn 's/.*([A-Za-z][0-9]\{6\}\bs([Aa0-9]\bs)).*/\bs1/p' \bs artificial-student-data.txt} \end{alltt}%$ \end{solution} \item the course code. \begin{solution} \par\medskip\par \begin{alltt} $ \textbf{sed -rn 's/^([0-9]\{4\}).*/\bs1/p' artificial-student-data.txt} \end{alltt}%$ \end{solution} \item the year of study \begin{solution} \par\medskip\par \begin{alltt} $ \textbf{sed -rn 's!^[0-9]\{4\}/([0-9]).*!\bs1!p' artificial-student-data.txt} \end{alltt}%$ \end{solution} \item The company the student works for \begin{solution} \par\medskip\par % sed -rn 's/^.{79}([A-Za-z0-9()/&",+.'\''-]+( % [A-Za-z0-9()/&",+.'\''-]+)+) .*/\1/p' artificial-student-data.txt Here is one rather complicated solution: \begin{alltt} $ \textbf{sed -rn 's/^.\{79\}([A-Za-z0-9()/&",+.'\bs''-]+'\bs '( [A-Za-z0-9()/&",+.'\bs''-]+)+) .*/\bs1/p' artificial-student-data.txt} \end{alltt}%$ Part of the complication is that you cannot put a single quote ``\texttt{'}'' in a single-quoted string in the shell. To achieve this effect, you need to: \begin{itemize} \item end the single quoted string with ``\texttt{'}'', then \item quote a single quote with a backslash like this: ``\texttt{\bs'}'', then \item start a new single-quoted string with ``\texttt{'}''. \end{itemize} This means that to put the word ``it's'' in a single quoted string in the shell, we would need to write something as horrible as this: \begin{alltt} 'it'\bs''s' \end{alltt} Another reason why the character class ``\texttt{[A-Za-z0-9()/\&",+.'\bs''-]}'' above looks so long and horrible is that there are so many different characters in the company names. The ``\texttt{-}'' needs to go at the end of the character class, otherwise it means a range of characters, as in ``\texttt{[A-Z]}''.
Here is another that just uses the \emph{position} of the company data: \begin{alltt} $ \textbf{sed -rn 's/^.\{79\}([^ ]+( [^ ]+)+) .*/\bs1/p' \bs artificial-student-data.txt} \end{alltt}%$ \end{solution} \item The home telephone number \begin{solution} \par\medskip\par We can just use the same approach as before: find the 8-digit number after the HK \ID: \begin{alltt} $ \textbf{sed -rn 's/[A-Z][0-9]\{6\}\bs([A0-9]\bs) +([0-9]\{8\}) .*/\bs1/p' \bs artificial-student-data.txt} \end{alltt}%$ \end{solution} \item The gender of the student \begin{solution} \par\medskip\par We can find the M or F just before a nine-digit number: \begin{alltt} $ \textbf{sed -rn 's/ ([MF]) +[0-9]\{9\} /\bs1/p' artificial-student-data.txt} \end{alltt}%$ \end{solution} \item The student's name \begin{solution} \par\medskip\par Just for variety, I'll take a different approach here from the approach I took when using \texttt{egrep} above. I'll find the name just before the gender, which is just before the nine-digit student number: % sed -rn 's/.{,6}([A-Za-z]{2,},?( [A-Za-z]+)+) +[MF] +[0-9]{9} .*/\1/ip' \ % artificial-student-data.txt \begin{alltt} $ \textbf{sed -rn 's/.\{,6\}([A-Za-z]\{2,\},?( [A-Za-z]+)+) +[MF] +[0-9]\{9\} .*/\bs1/p' \bs artificial-student-data.txt} \end{alltt}%$ Note that we cannot put `\texttt{.*}' at the beginning of the pattern, because that will gobble up the beginning of the student's name. That is because `\texttt{*}' is ``\emph{greedy},'' and matches as much as it possibly can. We can limit the match to ``at most 6 characters'' with `\texttt{.\{,6\}}'. \end{solution} \end{enumerate} \subsection{Using \texttt{awk}} \label{sec:awk} Use \texttt{awk} and \texttt{ls} to add up the size of all the files in your current directory. \begin{biganswerbox}[15mm]% \begin{solution}% \begin{alltt} $ \textbf{ls -l | awk '\{sum += $5\} END\{print sum\}'} \end{alltt} \end{solution} \end{biganswerbox} \end{document}