#!/usr/bin/env python
# FINDING WORDS
# How does it work?
# \b  a word boundary (a space, the beginning of a line, or punctuation)...
# w   a w, followed by...
# o   an o, followed by...
# r   an r, then...
# d   a d, and finally...
# \b  a word boundary at the end of the word
import re

WORD_RE = re.compile(r'\bword\b', re.M)


def contains_word(text):
    """Return True when *text* contains "word" as a whole word."""
    return WORD_RE.search(text) is not None


if __name__ == '__main__':
    # Read through a context manager so the file handle is always closed.
    with open('sample.txt') as sample:
        word_found = contains_word(sample.read())
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if word_found:
        print("I finally found what I'm looking for.")
    else:
        print("\"word\"s not here, man.")


#!/usr/bin/env python
# FINDING MULTIPLE WORDS WITH ONE SEARCH
# How does it work?
# Starting outside working in, this expression searches for something
# that's surrounded by whitespace. For example:
# \s     whitespace...
# +      found one or more times...
# (...)  followed by something...
# \s     followed by whitespace...
# +      that occurs one or more times.
# The something here is another expression, (moo)|(oink).
# This expression is as follows:
# (      a group that contains...
# m      an m, followed by...
# o      an o, then...
# o      an o...
# )      the end of the group...
# |      or...
# (      a group that contains...
# o      an o, followed by...
# i      an i, then...
# n      an n, followed by...
# k      a k...
# )      the end of a group
import re

ANIMAL_RE = re.compile(r'\s+((moo)|(oink))\s+')


def mentions_cow_or_pig(text):
    """Return True when *text* has "moo" or "oink" with whitespace on both sides."""
    return ANIMAL_RE.search(text) is not None


if __name__ == '__main__':
    with open('sample.txt') as sample:
        animal_found = mentions_cow_or_pig(sample.read())
    if animal_found:
        print("I spy a cow or pig.")
    else:
        print("Ah, there's no cow or pig here.")


#!/usr/bin/python
# FINDING VARIATIONS ON WORDS (John, Jon, Jonathan)
# How does it work?
# J      followed by...
# o      then...
# h      that is...
# ?      optional, followed by...
# n      followed by...
# (...)  a group of characters...
# ?      that may appear once, but isn't required, followed by...
#        a space, followed by
# D      then...
# o      and finally...
# e      at the end.
import re
import sys

JOHN_DOE_RE = re.compile(r'Joh?n(athan)? Doe', re.M)


def is_john_doe(text):
    """Return True when *text* contains Jon/John/Jonathan/Johnathan Doe."""
    return JOHN_DOE_RE.search(text) is not None


if __name__ == '__main__':
    if len(sys.argv) > 1:
        if is_john_doe(sys.argv[1]):
            print('Here\'s Johnny!')
        else:
            print('Who?')
    else:
        print('I came here for an argument!')


#!/usr/bin/python
# FINDING SIMILAR WORDS (Bat, Cat, Mat)
# How does it work?
# \b     is a word boundary, followed by...
# [bcm]  one of b, c, or m, followed by...
# a      then...
# t      and finally...
# \b     a word boundary.
import re
import sys

BAT_CAT_MAT_RE = re.compile(r'\b[bcm]at\b', re.M)


def spies_bat_cat_or_mat(text):
    """Return True when *text* contains the whole word bat, cat, or mat."""
    return BAT_CAT_MAT_RE.search(text) is not None


if __name__ == '__main__':
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if len(sys.argv) > 1:
        if spies_bat_cat_or_mat(sys.argv[1]):
            print('I spy a bat, a cat, or a mat')
        else:
            print('I don\'t spy nuttin\', honey')
    else:
        print('Come again?')


#!/usr/bin/python
# REPLACING TAB CHARACTERS
# How does it work?
# \t     is a tab, replaced by...
# ,      a comma
import re
import sys

TAB_RE = re.compile(r'\t', re.M)


def tabs_to_commas(text):
    """Return *text* with every tab character replaced by a comma."""
    return TAB_RE.sub(',', text)


if __name__ == '__main__':
    if len(sys.argv) > 1:
        # The context manager closes the file even if reading fails.
        with open(sys.argv[1]) as source:
            converted = tabs_to_commas(source.read())
        sys.stdout.write(converted)
    else:
        print('Come again?')


#!/usr/bin/python
# SEARCHING FOR REPEATED WORDS ACROSS MULTIPLE LINES
# How does it work?
# \b     is a word boundary, followed by...
# (...)  a group (explained next), then...
# \s     a space
# +      one or more times, then...
# \1     whatever was found in the group, and lastly...
# \b     a word boundary
# The group is simply (\w+), which is as follows:
# \w     is a word character...
# +      found one or more times.
import re
import sys

REPEATED_WORD_RE = re.compile(r'\b(\w+)\s+\1\b', re.M)


def has_repeated_word(text):
    """Return True when a word is immediately repeated anywhere in *text*.

    The repetition may be separated by any whitespace, including a newline.
    """
    # search() scans the whole text; match() would only look at its start.
    return REPEATED_WORD_RE.search(text) is not None


if __name__ == '__main__':
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source:
            doubled = has_repeated_word(source.read())
        if doubled:
            print("Found double words")
        else:
            print("No match here.")
    else:
        print("I came here for an argument.")


#!/usr/bin/python
# SPLITTING LINES IN A FILE
# How does it work?
# ,      is a comma followed by...
# \s     a space...
# *      none or many times.
# The replacement expression is simply as follows:
# ,      is a comma followed by
# \n     a newline character.
import re
import sys

COMMA_RE = re.compile(r',\s*', re.M)


def split_after_commas(text):
    """Return *text* with each comma (and trailing whitespace) replaced by ',\\n'."""
    return COMMA_RE.sub(',\n', text)


if __name__ == '__main__':
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source:
            split_text = split_after_commas(source.read())
        sys.stdout.write(split_text)
    else:
        print('Filename? Anyone? Anyone?')


#!/usr/bin/python
# JOINING LINES IN A FILE
# How does it work?
# \n     is a newline, replaced by...
# ,      is a comma followed by...
#        a space.
import re
import sys

NEWLINE_RE = re.compile(r'\n', re.M)


def join_lines(text):
    """Return *text* with every newline replaced by a comma and a space."""
    return NEWLINE_RE.sub(', ', text)


if __name__ == '__main__':
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if len(sys.argv) > 1:
        # The context manager closes the file even if reading fails.
        with open(sys.argv[1]) as source:
            joined = join_lines(source.read())
        print(joined)
    else:
        print('Filename? Anyone? Anyone?')


#!/usr/bin/python
# FILTERING THE OUTPUT OF DU
# How does it work?
# ^       the beginning of the line...
# (?:     a noncapturing group that contains...
# (?:     a noncapturing group that contains...
# [2-9]   a digit two through nine...
# [0-9]   a digit zero through nine...
# {2}     found two times...
# M       an M (printed by du -h for megabytes)...
# )       the end of the inside group...
# |       or...
# (?:     another noncapturing group that contains...
# [0-9.]  a character class that contains zero through nine or a literal .
# +       found one or more times...
# G       a G...
# )       the end of the group...
# )       the end of the outside group...
# \s      whitespace...
# +       one or more times...
# (       a group that contains...
# .       any character...
# *       found zero, one, or many times...
# )       the end of the group...
# $       the end of the line.
import re
import sys
from os import popen

DU_COMMAND = 'du -hs /Users/myusername/*'
LARGE_DIR_RE = re.compile(r'^(?:(?:[2-9][0-9]{2}M)|(?:[0-9.]+G))\s+(.*)$')
LARGE_DIR_MESSAGE = r'You have more than 200MB in directory: \1'


def report_large_directories(lines):
    """Return a report line for each ``du -h`` line sized 200M-999M or in G.

    *lines* is any iterable of ``du -h`` output lines. Lines that do not
    match are dropped; a matching line keeps its trailing newline.
    """
    return [
        LARGE_DIR_RE.sub(LARGE_DIR_MESSAGE, line)
        for line in lines
        if LARGE_DIR_RE.match(line)
    ]


if __name__ == '__main__':
    # The context manager closes the pipe even if processing fails.
    with popen(DU_COMMAND) as du_output:
        reports = report_large_directories(du_output)
    for report in reports:
        sys.stdout.write(report)


#!/usr/bin/python
# CHANGING DOS TEXT TO UNIX TEXT
# How does it work?
# Windows text files have extra line characters at the end of the line,
# and this can sometimes cause problems on a Unix or Linux box.
# This can especially rear its ugly head when a text file is transferred
# as a binary over an FTP session. (ASCII mode on most FTP servers takes
# the extra line terminator out during transfer.) This can cause issues
# with shell scripts, and so on.
# This simple recipe will remove the extra character, which is simply this: \r.
# It's removed here by replacing it with a zero-length string.
import re
import sys

CARRIAGE_RETURN_RE = re.compile(r'\r')


def strip_carriage_returns(lines):
    """Return the given *lines* with every carriage return removed."""
    return [CARRIAGE_RETURN_RE.sub(r'', line) for line in lines]


if __name__ == '__main__':
    if len(sys.argv) > 1:
        # The context manager closes the file even if processing fails.
        with open(sys.argv[1]) as source:
            unix_lines = strip_carriage_returns(source)
        for unix_line in unix_lines:
            sys.stdout.write(unix_line)
    else:
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print('I\'ve seen Tron eight times')


#!/usr/bin/python
# SEARCHING FOR A SUBJECT IN MAIL FILES
# How does it work?
# ^         the beginning of the line...
# [         a character class that contains...
#           a space...
# >         a >...
# ]         the end of the character class...
# *         zero, one, or more times...
# Subject:  the subject label.
import re
import sys
from os import popen

SUBJECT_RE = re.compile(r'^[ >]*Subject:')


def subject_lines(lines):
    """Return the *lines* that start with an optionally quoted "Subject:"."""
    return [line for line in lines if SUBJECT_RE.match(line)]


if __name__ == '__main__':
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as mailbox:
            subjects = subject_lines(mailbox)
        for subject in subjects:
            sys.stdout.write(subject)
    else:
        print('Please supply a parameter!')


#!/usr/bin/python
# PARSING THE OUTPUT OF DF
# How does it work?
# \s     whitespace...
# (      a group that contains...
# (      another group with...
# [5-9]  five through nine (in the tens place)...
# [0-9]  zero through nine (in the ones place)...
# )      the end of the group
# |      or...
# 100    1, 0, and 0...
# )      the end of the group...
# %      a percent sign...
# \s     whitespace.
import re
import sys
from os import popen

DF_USAGE_RE = re.compile(r'^.*\s(([5-9][0-9])|(100))%\s.*$')


def heavily_used_filesystems(lines):
    """Return the ``df`` output *lines* that report 50% to 100% usage."""
    return [line for line in lines if DF_USAGE_RE.match(line)]


if __name__ == '__main__':
    # The context manager closes the pipe even if processing fails.
    with popen('df') as df_output:
        busy_lines = heavily_used_filesystems(df_output)
    for busy_line in busy_lines:
        sys.stdout.write(busy_line)