Re error nothing to repeat at position 0

Hi, I'm trying to open a seq file using the following code: import flirpy.io.seq as fs img_split = fs.Splitter(output_folder='./output', width=320, height=246, exiftool_path='path/t...

Hi,

I’m trying to open a seq file using the following code:

import flirpy.io.seq as fs
img_split = fs.Splitter(output_folder='./output', width=320, height=246, exiftool_path='path/to/exif/exiftool.exe')
img_split.process(['./Ply4 - Constant Direction_Tape0004.SEQ'])

The first two lines are fine but when I try and process the final line I get the following error:


error Traceback (most recent call last)
Input In [6], in <cell line: 1>()
—-> 1 img_split.process([‘./Ply4 — Constant Direction_Tape0004.SEQ’])

File pathRHPlibsite-packagesflirpyioseq.py:144, in Splitter.process(self, file_list)
141 Path(folder).mkdir(exist_ok=True)
143 logger.info(«Splitting {} into {}».format(seq, folder))
—> 144 self._process_seq(seq, folder)
146 # Batch export meta data
147 if self.export_meta:

File pathRHPlibsite-packagesflirpyioseq.py:197, in Splitter._process_seq(self, input_file, output_subfolder)
193 def _process_seq(self, input_file, output_subfolder):
195 logger.debug(«Processing {}».format(input_file))
—> 197 for count, frame in enumerate(tqdm(Seq(input_file, self.height, self.width))):
199 if frame.meta is None:
200 self.frame_count += 1

File pathRHPlibsite-packagestqdmstd.py:1195, in tqdm.iter(self)
1192 time = self._time
1194 try:
-> 1195 for obj in iterable:
1196 yield obj
1197 # Update and possibly print the progressbar.
1198 # Note: does not call self.update(1) for speed optimisation.

File pathRHPlibsite-packagesflirpyioseq.py:79, in Seq.getitem(self, index)
76 offset, chunksize = self.pos[index]
77 chunk = self.seq_blob[offset:offset+chunksize]
—> 79 return Fff(chunk, self.width, self.height)

File pathRHPlibsite-packagesflirpyiofff.py:34, in Fff.init(self, data, width, height)
32 self.height = height
33 self.meta = {}
—> 34 self._find_data_offset_simple(width, height)
35 else:
36 try:

File pathRHPlibsite-packagesflirpyiofff.py:91, in Fff._find_data_offset_simple(self, width, height)
86 def _find_data_offset_simple(self, width, height):
87 search = struct.pack(«<H», width-1)
88 + b»\x00\x00″
89 + struct.pack(«<H», height-1)
—> 91 valid = re.compile(search)
92 res = valid.search(self.data)
94 self.data_offset = res.end() + 14

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libre.py:252, in compile(pattern, flags)
250 def compile(pattern, flags=0):
251 «Compile a regular expression pattern, returning a Pattern object.»
—> 252 return _compile(pattern, flags)

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libre.py:304, in _compile(pattern, flags)
302 if not sre_compile.isstring(pattern):
303 raise TypeError(«first argument must be string or compiled pattern»)
—> 304 p = sre_compile.compile(pattern, flags)
305 if not (flags & DEBUG):
306 if len(_cache) >= _MAXCACHE:
307 # Drop the oldest item

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_compile.py:764, in compile(p, flags)
762 if isstring(p):
763 pattern = p
—> 764 p = sre_parse.parse(p, flags)
765 else:
766 pattern = None

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_parse.py:948, in parse(str, flags, state)
945 state.str = str
947 try:
—> 948 p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
949 except Verbose:
950 # the VERBOSE flag was switched on inside the pattern. to be
951 # on the safe side, we’ll parse the whole thing again…
952 state = State()

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_parse.py:443, in _parse_sub(source, state, verbose, nested)
441 start = source.tell()
442 while True:
—> 443 itemsappend(_parse(source, state, verbose, nested + 1,
444 not nested and not items))
445 if not sourcematch(«|»):
446 break

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_parse.py:668, in _parse(source, state, verbose, nested, first)
666 item = None
667 if not item or item[0][0] is AT:
—> 668 raise source.error(«nothing to repeat»,
669 source.tell() — here + len(this))
670 if item[0][0] in _REPEATCODES:
671 raise source.error(«multiple repeat»,
672 source.tell() — here + len(this))

error: nothing to repeat at position 0

exif tools tells me it is:
File Permissions : -rw-rw-rw-
File Type : SEQ
File Type Extension : seq
MIME Type : image/x-flir-seq
Creator Software : CAMCTRL
Camera Model : FLIR SC325
Raw Thermal Image Width : 320
Raw Thermal Image Height : 246
Raw Thermal Image Type : TIFF
Raw Thermal Image : (Binary data 157644 bytes, use -b option to extract)

(and a bunch of stuff I don’t think is relevant)

Is this another case of endianness biting me? This is the first thing I’ve done with a thermal camera, so apologies if I’ve missed anything obvious.

Thanks,

Tim

Содержание

  1. re.error: nothing to repeat at position 2 #568
  2. Comments
  3. error: nothing to repeat at position 0 #66
  4. Comments
  5. Python Regex Multiple Repeat Error
  6. How Does the Multiple Repeat Error Arise in Python Re?
  7. [Tips] What’s the Source of the Multiple Repeat Error and How to Avoid It?
  8. Python Regex Quantifiers
  9. Alternative Error Message (Fragments)
  10. Where to Go From Here?
  11. Regex Humor

re.error: nothing to repeat at position 2 #568

Great work with Paperless, It’s making my paperwork less time consuming.
Recently I have been getting a regex error while running the consumer. The process quits with the line «re.error: nothing to repeat at position 2» in the Traceback. It seems to relate to the systems Python file «sre_parse.py» and the line 651:

raise source.error(«nothing to repeat», source.tell() - here + len(this))

It seems to interpret the «+» as a repetition symbol in the regexp. I’m wondering how to fix this, since I don’t feel comfortable messing around in the systems Python files.

The text was updated successfully, but these errors were encountered:

Traceback (most recent call last):
File «/usr/src/paperless/src/manage.py», line 11, in
execute_from_command_line(sys.argv)
File «/usr/lib/python3.7/site-packages/django/core/management/init.py», line 371, in execute_from_command_line
utility.execute()
File «/usr/lib/python3.7/site-packages/django/core/management/init.py», line 365, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File «/usr/lib/python3.7/site-packages/django/core/management/base.py», line 288, in run_from_argv
self.execute(*args, **cmd_options)
File «/usr/lib/python3.7/site-packages/django/core/management/base.py», line 335, in execute
output = self.handle(*args, **options)
File «/usr/src/paperless/src/documents/management/commands/document_consumer.py», line 96, in handle
self.loop_inotify(mail_delta)
File «/usr/src/paperless/src/documents/management/commands/document_consumer.py», line 129, in loop_inotify
self.loop_step(mail_delta)
File «/usr/src/paperless/src/documents/management/commands/document_consumer.py», line 121, in loop_step
self.file_consumer.consume_new_files()
File «/usr/src/paperless/src/documents/consumer.py», line 112, in consume_new_files
if not self.try_consume_file(file):
File «/usr/lib/python3.7/contextlib.py», line 74, in inner
return func(*args, **kwds)
File «/usr/src/paperless/src/documents/consumer.py», line 158, in try_consume_file
date
File «/usr/src/paperless/src/documents/consumer.py», line 228, in _store
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
File «/usr/src/paperless/src/documents/models.py», line 84, in match_all
if tag.matches(text):
File «/usr/src/paperless/src/documents/models.py», line 108, in matches
if re.search(r»\b{}\b».format(word), text, **search_kwargs):
File «/usr/lib/python3.7/re.py», line 183, in search
return _compile(pattern, flags).search(string)
File «/usr/lib/python3.7/re.py», line 286, in _compile
p = sre_compile.compile(pattern, flags)
File «/usr/lib/python3.7/sre_compile.py», line 764, in compile
p = sre_parse.parse(p, flags)
File «/usr/lib/python3.7/sre_parse.py», line 930, in parse
p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0)
File «/usr/lib/python3.7/sre_parse.py», line 426, in _parse_sub
not nested and not items))
File «/usr/lib/python3.7/sre_parse.py», line 651, in _parse
source.tell() — here + len(this))
re.error: nothing to repeat at position 2

Thanks for reporting this @kj21. I think I found a fix and opened PR #571 for this. We’ll let you know once it is fixed!

Источник

error: nothing to repeat at position 0 #66

I’m trying to open a seq file using the following code:

The first two lines are fine but when I try and process the final line I get the following error:

error Traceback (most recent call last)
Input In [6], in ()
—-> 1 img_split.process([‘./Ply4 — Constant Direction_Tape0004.SEQ’])

File pathRHPlibsite-packagesflirpyioseq.py:144, in Splitter.process(self, file_list)
141 Path(folder).mkdir(exist_ok=True)
143 logger.info(«Splitting {} into {}».format(seq, folder))
—> 144 self._process_seq(seq, folder)
146 # Batch export meta data
147 if self.export_meta:

File pathRHPlibsite-packagesflirpyioseq.py:197, in Splitter._process_seq(self, input_file, output_subfolder)
193 def _process_seq(self, input_file, output_subfolder):
195 logger.debug(«Processing {}».format(input_file))
—> 197 for count, frame in enumerate(tqdm(Seq(input_file, self.height, self.width))):
199 if frame.meta is None:
200 self.frame_count += 1

File pathRHPlibsite-packagestqdmstd.py:1195, in tqdm.iter(self)
1192 time = self._time
1194 try:
-> 1195 for obj in iterable:
1196 yield obj
1197 # Update and possibly print the progressbar.
1198 # Note: does not call self.update(1) for speed optimisation.

File pathRHPlibsite-packagesflirpyioseq.py:79, in Seq.getitem(self, index)
76 offset, chunksize = self.pos[index]
77 chunk = self.seq_blob[offset:offset+chunksize]
—> 79 return Fff(chunk, self.width, self.height)

File pathRHPlibsite-packagesflirpyiofff.py:34, in Fff.init(self, data, width, height)
32 self.height = height
33 self.meta = {}
—> 34 self._find_data_offset_simple(width, height)
35 else:
36 try:

File pathRHPlibsite-packagesflirpyiofff.py:91, in Fff._find_data_offset_simple(self, width, height)
86 def _find_data_offset_simple(self, width, height):
87 search = struct.pack(«<H», width-1)
88 + b»\x00\x00″
89 + struct.pack(«<H», height-1)
—> 91 valid = re.compile(search)
92 res = valid.search(self.data)
94 self.data_offset = res.end() + 14

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libre.py:252, in compile(pattern, flags)
250 def compile(pattern, flags=0):
251 «Compile a regular expression pattern, returning a Pattern object.»
—> 252 return _compile(pattern, flags)

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libre.py:304, in _compile(pattern, flags)
302 if not sre_compile.isstring(pattern):
303 raise TypeError(«first argument must be string or compiled pattern»)
—> 304 p = sre_compile.compile(pattern, flags)
305 if not (flags & DEBUG):
306 if len(_cache) >= _MAXCACHE:
307 # Drop the oldest item

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_compile.py:764, in compile(p, flags)
762 if isstring(p):
763 pattern = p
—> 764 p = sre_parse.parse(p, flags)
765 else:
766 pattern = None

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_parse.py:948, in parse(str, flags, state)
945 state.str = str
947 try:
—> 948 p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
949 except Verbose:
950 # the VERBOSE flag was switched on inside the pattern. to be
951 # on the safe side, we’ll parse the whole thing again.
952 state = State()

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_parse.py:443, in _parse_sub(source, state, verbose, nested)
441 start = source.tell()
442 while True:
—> 443 itemsappend(_parse(source, state, verbose, nested + 1,
444 not nested and not items))
445 if not sourcematch(«|»):
446 break

File C:Program FilesWindowsAppsPythonSoftwareFoundation.Python.3.9_3.9.2800.0_x64__qbz5n2kfra8p0libsre_parse.py:668, in _parse(source, state, verbose, nested, first)
666 item = None
667 if not item or item[0][0] is AT:
—> 668 raise source.error(«nothing to repeat»,
669 source.tell() — here + len(this))
670 if item[0][0] in _REPEATCODES:
671 raise source.error(«multiple repeat»,
672 source.tell() — here + len(this))

error: nothing to repeat at position 0

exif tools tells me it is:
File Permissions : -rw-rw-rw-
File Type : SEQ
File Type Extension : seq
MIME Type : image/x-flir-seq
Creator Software : CAMCTRL
Camera Model : FLIR SC325
Raw Thermal Image Width : 320
Raw Thermal Image Height : 246
Raw Thermal Image Type : TIFF
Raw Thermal Image : (Binary data 157644 bytes, use -b option to extract)

(and a bunch of stuff I don’t think is relevant)

Is this another case of endianness biting me? This is the first thing I’ve done with a thermal camera, so apologies if I’ve missed anything obvious.

The text was updated successfully, but these errors were encountered:

Источник

Python Regex Multiple Repeat Error

Just like me an hour ago, you’re probably sitting in front of your regular expression code, puzzled by a strange error message:

Why is it raised? Where does it come from? And, most importantly, how can you get rid of it?

This article gives you answers to all of those questions. Alternatively, you can also watch my short explainer video that shows you real quick how to resolve this error:

How Does the Multiple Repeat Error Arise in Python Re?

Python’s regex library re throws the multiple repeat error when you stack two regex quantifiers on top of each other. For example, the regex pattern ‘a++’ will cause the multiple repeat error. You can get rid of this error by avoiding to stack quantifiers on top of each other.

Here’s an example:

I have shortened the error message to focus on the relevant parts. In the code, you first import the regex library re . You then use the re.findall(pattern, string) function (see this blog tutorial) to find the pattern ‘a++’ in the string ‘aaaa’ .

However, this doesn’t make a lot of sense: what’s the meaning of the pattern a++ anyway? Having a single quantifier a+ already reads as “find all matches where at least one character ‘a’ matches”.

Do you want to master the regex superpower? Check out my new book The Smartest Way to Learn Regular Expressions in Python with the innovative 3-step approach for active learning: (1) study a book chapter, (2) solve a code puzzle, and (3) watch an educational chapter video.

[Tips] What’s the Source of the Multiple Repeat Error and How to Avoid It?

The error happens if you use the Python regex package re . There are many different reasons but all of them have the same source: you stack quantifiers on top of each other.

If you don’t know what a quantifier is, scroll down and read the following subsection where I show you exactly what it is.

Here’s a list of reasons for the error message. Maybe your reason is among them?

  • You use the regex pattern ‘X++’ for any regex expression X . To avoid this error, get rid of one quantifier.
  • You use the regex pattern ‘X+*’ for any regex expression X . To avoid this error, get rid of one quantifier.
  • You use the regex pattern ‘X**’ for any regex expression X . To avoid this error, get rid of one quantifier.
  • You use the regex pattern ‘X{m}{n}’ for any regex expression X and number of repetitions m and n . To avoid this error, get rid of one quantifier.
  • You try to match a number of characters ‘+’ and use a second quantifier on top of it such as ‘+?’ . In this case, you should escape the first quantifier symbol: ‘\+’ .
  • You try to match a number of characters ‘*’ and use a second quantifier on top of it such as ‘*+’ . Avoid this error by escaping the first quantifier symbol: ‘\*’ .

Oftentimes, the error appears if you don’t properly escape the special quantifier meta-characters in your regex pattern.

Here’s a StackOverflow post that shows some code where this happened:

I edited the given code snippet to show the important part. The code fails because of a multiple repeat error . Can you see why?

The reason is that the regex ‘lg incite" OR author:"http++www.dealitem.com" OR "for sale’ contains two plus quantifiers stacked on top of each other in the substring ‘http++’ . Get rid of those and the code will run again!

Python Regex Quantifiers

The word “quantifier” originates from Latin: its meaning is quantus = how much / how often.

This is precisely what a regular expression quantifier means: you tell the regex engine how often you want to match a given pattern.

If you think you don’t define any quantifier, you do it implicitly: no quantifier means to match the regular expression exactly once.

So what are the regex quantifiers in Python?

Quantifier Meaning
A? Match regular expression A zero or one times
A* Match regular expression A zero or more times
A+ Match regular expression A one or more times
A{m} Match regular expression A exactly m times
A{m,n} Match regular expression A between m and n times (included)

Note that in this tutorial, I assume you have at least a remote idea of what regular expressions actually are. If you haven’t, no problem, check out my detailed regex tutorial on this blog.

You see in the table that the quantifiers ? , * , + , {m} , and {m,n} define how often you repeat the matching of regex A .

Let’s have a look at some examples—one for each quantifier:

In each line, you try a different quantifier on the same text ‘aaaa’ . And, interestingly, each line leads to a different output:

  • The zero-or-one regex ‘a?’ matches four times one ‘a’ . Note that it doesn’t match zero characters if it can avoid doing so.
  • The zero-or-more regex ‘a*’ matches once four ‘a’ s and consumes them. At the end of the string, it can still match the empty string.
  • The one-or-more regex ‘a+’ matches once four ‘a’ s. In contrast to the previous quantifier, it cannot match an empty string.
  • The repeating regex ‘a{3}’ matches up to three ‘a’ s in a single run. It can do so only once.
  • The repeating regex ‘a{1,2}’ matches one or two ‘a’ s. It tries to match as many as possible.

You’ve learned the basic quantifiers of Python regular expressions.

Alternative Error Message (Fragments)

You may encounter any of the following fragments that all lead to the multiple repeat error:

  • re.error: multiple repeat at position
  • multiple repeat at position
  • sre_constants.error: multiple repeat
  • python regex multiple repeat
  • python re multiple repeat
  • regex multiple repeat
  • re.error multiple repeat at position

Again, you can fix the multiple repeat error by avoiding to stack two regex quantifiers on top of each other. For example, the regex pattern ‘a++’ will cause the multiple repeat error—use a single quantifier such as ‘a+’ instead.

Where to Go From Here?

To summarize, you’ve learned that the multiple repeat error appears whenever you try to stack multiple quantifiers on top of each other. Avoid this and the error message will disappear.

If you want to boost your Python regex skills to the next level, check out my free in-depth regex superpower tutorial (20,000+ words). Or just bookmark the article to read later.

Regex Humor

While working as a researcher in distributed systems, Dr. Christian Mayer found his love for teaching computer science students.

To help students reach higher levels of Python success, he founded the programming education website Finxter.com. He’s author of the popular programming book Python One-Liners (NoStarch 2020), coauthor of the Coffee Break Python series of self-published books, computer science enthusiast, freelancer, and owner of one of the top 10 largest Python blogs worldwide.

His passions are writing, reading, and coding. But his greatest passion is to serve aspiring coders through Finxter and help them to boost their skills. You can join his free email academy here.

Источник


 
Data visualization

  • Extract phone numbers – v1
  • Extract phone numbers -v2
  • Extract phone numbers -v3
  • Extract phone numbers -v4
  • Extract phone numbers -v5
  • Extract emails
  • Extract zip codes
  • Log file analysis
    • Challenge
  • Credit cards extractor
  • Word occurrences
  • HTML tags
  • Password verification
  • Regular expressions cheatsheet
  • Challenge – Extract data to JSON format

Regular expressions are a very important tool for a data scientist or
a machine learning engineer. Regular expressions are a dry and boring
topic to learn. But the problems Regular Expressions solve are very real
and interesting. So, we will learn Regular Expressions with a problem
solving approach. We will define a series of small problems, solve them
step by step and with each problem, we will learn some of the aspects of
Regular Expressions.

If you are not comfortable with this kind of non-linear approach, this course might not be for you.

Problem – Extract all the phone numbers from this text.

numbers = '''There are 3 phone numbers that you need to call in case of medical emergency. 
For casualty, call 408-202-2222. For elderly emergencies, call 408-203-2222 and
for everything else call 408-202-4444
'''


Let’s take a much simpler case – just a list of 3 phone numbers and no text. If the pattern is at the beginning of the string, you can use the match ( ) function. Also, the match ( ) function only returns the first occurrence.

import re  #import for regular expressions

numbers = '''408-202-2222
             408-203-2222
             408-202-4444'''
match = re.match("408-\d\d\d-\d\d\d\d",numbers)

match ( ) function returns a match object. It contains the span (the start and end of the match) and the actual match itself. Use the group ( ), span( ), start( ) and end( ) functions to get the specifics of the match.

print ( "matching text  = ", match.group())
print ( "start position = ", match.start())
print ( "end position   = ",match.end())
matching text  =  408-202-2222
start position =  0
end position   =  12

Let’s try something slightly different with match ( ) function. Will it be able to pick the pattern from this text ?

numbers = '''
             408-202-2222
             408-203-2222
             408-202-4444'''

match = re.match("408-\d\d\d-\d\d\d\d",numbers)

No. Why is that ? match ( ) function can only find out the pattern at the beginning of the string. In this case, the first line is a blank line. So, the match ( ) function fails. In cases like this, use the search ( ) function. In contrast to the match ( ) function, the search ( ) function can extract patterns anywhere in the text.

match = re.search("408-\d\d\d-\d\d\d\d",numbers)

But we are still getting the first match only. We wanted all the matches, right ? Both the search ( ) and match ( ) functions return the first match only. To get all the matches, we will have to use other functions like findall ( ) or finditer ( ).

matches = re.findall("408-\d\d\d-\d\d\d\d",numbers)

matches = re.findall("408-\d\d\d-\d\d\d\d",numbers)

['408-202-2222', '408-203-2222', '408-202-4444']

That’s much better, right ? The findall ( ) function returns all the matches it finds in the text. The other function finditer ( ) just returns the same results in an iterable.

matches = re.finditer("408-\d\d\d-\d\d\d\d",numbers)

for match in matches : 
    print ( match )
<re.Match object; span=(14, 26), match='408-202-2222'>
<re.Match object; span=(40, 52), match='408-203-2222'>
<re.Match object; span=(66, 78), match='408-202-4444'>

If you wanted just the match, use the group () function to extract the matching text.

matches = re.finditer("408-\d\d\d-\d\d\d\d",numbers)

for match in matches : 
    print ( match.group() )

408-202-2222
408-203-2222
408-202-4444


Now, we can solve the problem we started out with.

numbers = '''There are 3 phone numbers that you need to call in case of medical emergency. 
For casualty, call 408-202-2222. For elderly emergencies, call 408-203-2222 and
for everything else call 408-202-4444
'''

matches = re.finditer("408-\d\d\d-\d\d\d\d",numbers)

for match in matches : 
    print ( match.group() )

408-202-2222
408-203-2222
408-202-4444


In fact, even if the starting phone number is not always constant, like a 408 in this case, still we should be able to extract the matches.

numbers = '''There are 3 phone numbers that you need to call in case of medical emergency. 
For casualty, call 408-202-2222. For elderly emergencies, call 408-203-2222 and
for everything else call 800-202-4444
'''
matches = re.finditer("\d\d\d-\d\d\d-\d\d\d\d",numbers)

for match in matches : 
    print ( match.group() )
408-202-2222
408-203-2222
800-202-4444

See, all the numbers have been extracted.

Points to Remember

  • \d represents a single digit
  • match ( ) function returns the first match only, and only if the match starts at the beginning of the string.
  • search ( ) function returns the first match only.
  • findall ( ) and finditer ( ) functions return all the matches.

Problem – Extract all the phone numbers from this text message.

numbers = '''408-222-2222,
             (408)-333-3333,
             (800)-444-4444'''


Let’s try what we know so far.

match = re.findall("\d\d\d-\d\d\d-\d\d\d\d",numbers)

But this only matches the phone numbers without brackets. What about the ones with paranthesis ? We can try something like this.

match = re.findall("(\d\d\d)-\d\d\d-\d\d\d\d",numbers)
print ( match )


oops.. it is not working. Why ? Because a parenthesis is a special character – it is used to make groups out of regular expressions (which we will see later). To represent an actual parenthesis, escape it with a backslash.

match = re.findall("\(\d\d\d\)-\d\d\d-\d\d\d\d",numbers)
print ( match )
['(408)-333-3333', '(800)-444-4444']

OK. Now, we got the phone numbers with paranthesis, but we missed the ones without paranthesis. We want to capture either of these combinations. That’s when we use the OR operator. In regular expressions, we use the pipe operator (|) to represent either/or type of patterns.

match = re.findall("\(\d\d\d\)-\d\d\d-\d\d\d\d|\d\d\d-\d\d\d-\d\d\d\d",numbers)
print ( match )
['408-222-2222', '(408)-333-3333', '(800)-444-4444']

There we go – we were able to capture both the patterns. However, the \d in the pattern repeats a lot, making the string too long. Instead, we can use quantifiers to specify how long a particular sub-pattern can be. For example, the following pattern is a more compact way of writing the pattern above.

match = re.findall("\(\d{3}\)-\d{3}-\d{4}|\d{3}-\d{3}-\d{3}",numbers)
print ( match )

['408-222-222', '(408)-333-3333', '(800)-444-4444']

As you can see, quantifiers make the pattern much more compact in case there is a lot of repetition.

Points to Remember

  • If a parenthesis, ( or ), needs to be used in the pattern, escape it with a backslash ( \ ). This is done because parentheses are used to represent groups, which we will look into later.
  • | or pipe character is used to represent a logical OR operator in regular expressions.
  • { } Flower brackets are used to quantify the number of occurrances of a particular part of a regular expression. For example, a{3} is used to indicate that exactly 3 a‘s should be looked for.

Problem – Extract all the phone numbers from this text message.

numbers = '''408-222-2222,
             408.333.3333,
             800 444 4444'''

match = re.findall("\d{3}-\d{3}-\d{4}|\d{3}.\d{3}.\d{3}|\d{3}\d{3}\d{3}",numbers)
print ( match )

['408-222-2222', '408.333.333', '800 444 444']


This works. But, can we make it any more concise ? There seems to be a lot of repetition. This is where character sets come in. In this case, the separator between the phone numbers is either a dash or a dot or a blank space. Can we somehow represent all of these characters to be searched for as separators, as opposed to specifying each pattern separately ?

match = re.findall("\d{3}[-.]\d{3}[-.]\d{4}",numbers)
print ( match )
['408-222-2222', '408.333.3333']

But what about phone numbers with spaces ? How do we represent a space in regular expressions ? We use the special character \s.

match = re.findall("\d{3}[-.\s]\d{3}[-.\s]\d{4}",numbers)
print ( match )

['408-222-2222', '408.333.3333', '800 444 4444']

There we go – we are able to capture all of the phone numbers.

Points to Remember

  • Characters enclosed in [] (square brackets) are called character sets. Regular expressions search for any character inside the charater set for matches.
  • \s is used to represent a space or blank character.

Problem – Extract all the phone numbers from this text.

numbers = '''   408-222-2222,
             1 408.333.3333,
             1 408-444-4444,
             1 (800) 444 4444'''

match = re.findall("\d{3}[-.\s]\d{3}[-.\s]\d{4}",numbers)
print ( match )
['408-222-2222', '408.333.3333', '408-444-4444']

But how about the 1 before the numbers ? How do we capture them ? Some phone numbers have it and some don’t. That’s where the ? quantifier comes in. If a pattern needs to be checked for zero or one occurrence, use the ? quantifier.

match = re.findall("1?\s\d{3}[-.\s]\d{3}[-.\s]\d{4}",numbers)
print ( match )
[' 408-222-2222', '1 408.333.3333', '1 408-444-4444']

Much better. Now, what about the 800 number with paranthesis ? How do we look for paranthesis ? We have seen previously that paranthesis is a special character and to extract that we need to escape it. Let’s try that.

match = re.findall("1?\s\(?\d{3}\)?[\s]\d{3}[\s]\d{3}",numbers)
print ( match )

Alright, we got that as well. Now, to combine all of these, we can use the OR operator.

match = re.findall("1?\s\(?\d{3}\)?[\s]\d{3}[\s]\d{3}|1?\s\d{3}[-.\s]\d{3}[-.\s]\d{4}",numbers)
print ( match )
[' 408-222-2222', '1 408.333.3333', '1 408-444-4444', '1 (800) 444 444']

Or, we can combine them like this.

match = re.findall("1?\s\(?\d{3}\)?[-.\s]\d{3}[-.\s]\d{3}",numbers)
print ( match )
[' 408-222-222', '1 408.333.333', '1 408-444-444', '1 (800) 444 444']

Learning

  • ? is used to represent a pattern that repeats zero or one time. It is a type of quantifier like {n}

Problem – Extract all the phone numbers from this text.

numbers = '''+1  408-222-2222,    
             +91 98989-99898,
             +86 10-1234-5678,
             +263 10-234-5678'''

The first one is a US phone number, the second one is an Indian number, the third one is a Chinese number and the fourth one is a Zimbabwean number. How do we extract these ? Let’s start with the plus (+) at the beginning of the string. How do we extract that ?

match = re.findall("+",numbers)
---------------------------------------------------------------------------
error                                     Traceback (most recent call last)
<ipython-input-164-2fe5ce5c5168> in <module>
----> 1 match = re.findall("+",numbers)

c:program filespython37libre.py in findall(pattern, string, flags)
    221 
    222     Empty matches are included in the result."""
--> 223     return _compile(pattern, flags).findall(string)
    224 
    225 def finditer(pattern, string, flags=0):

c:program filespython37libre.py in _compile(pattern, flags)
    284     if not sre_compile.isstring(pattern):
    285         raise TypeError("first argument must be string or compiled pattern")
--> 286     p = sre_compile.compile(pattern, flags)
    287     if not (flags & DEBUG):
    288         if len(_cache) >= _MAXCACHE:

c:program filespython37libsre_compile.py in compile(p, flags)
    762     if isstring(p):
    763         pattern = p
--> 764         p = sre_parse.parse(p, flags)
    765     else:
    766         pattern = None

c:program filespython37libsre_parse.py in parse(str, flags, pattern)
    928 
    929     try:
--> 930         p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0)
    931     except Verbose:
    932         # the VERBOSE flag was switched on inside the pattern.  to be

c:program filespython37libsre_parse.py in _parse_sub(source, state, verbose, nested)
    424     while True:
    425         itemsappend(_parse(source, state, verbose, nested + 1,
--> 426                            not nested and not items))
    427         if not sourcematch("|"):
    428             break

c:program filespython37libsre_parse.py in _parse(source, state, verbose, nested, first)
    649             if not item or item[0][0] is AT:
    650                 raise source.error("nothing to repeat",
--> 651                                    source.tell() - here + len(this))
    652             if item[0][0] in _REPEATCODES:
    653                 raise source.error("multiple repeat",

error: nothing to repeat at position 0

Oops, that doesn’t work. That is because + is a special character: it is a quantifier, meaning that the preceding pattern repeats one or more times. With nothing in front of it, there is “nothing to repeat”. So, to find a literal + you have to escape it with a backslash (\+).

match = re.findall("\+", numbers)
print ( match )

OK, now we are able to get the + in the string. Let’s extract the country code next. It is the set of digits right next to the +
symbol. It could be 1 digit (like the US), 2 digits (like India or China), or maybe 3 digits
(Zimbabwe is +263). We can use curly braces to specify a
repetition count of 1 to 3, like so –

{1,3}

match = re.findall("\+\d{1,3}", numbers)
print ( match )


Next, we have a set of numbers separated by dashes. However, the count of digits between the dashes is arbitrary. So, we need some kind of quantifier again to match a repeating pattern with a count between 1 and n. We could just assume a high enough number, say 6, for n and proceed like so.

match = re.findall("\+\d{1,3}\s{1,3}\d{1,6}-\d{1,6}-\d{1,6}", numbers)
print ( match )

Instead of using the {m,n} quantifier to identify digits that repeat at least once, you can use the quantifier +.

match = re.findall("\+\d+\s+\d+-\d+-\d+", numbers)
print ( match )


We are still missing another number , +91 98989-99898. This is because, the number is divided into 2 parts (and not 3 parts separated by dashes). So, a simple solution would be to create another pattern and do an OR operation. That should capture all of the possible phone numbers in this case.

match = re.findall("\+\d+\s+\d+-\d+-\d+|\+\d+\s+\d+-\d+", numbers)
print ( match )

Learning

  • {m,n} is used to represent a pattern that repeats m to n number of times. It is a type of quantifier.
  • Since + is a special character (used to identify patterns that repeat 1 or more times), to identify + itself, escape it with a backslash (\+)

Problem – Extract all the emails from this text.

text = '''   accounts@boa.com,    
             sales@boa.com,
             cancellations@tesla.com,
             accounts@delloitte.com,
             cancellations@farmers.com,
             accounts@dell@com'''

To solve text-based patterns, one of the fundamental character sets is \w. It represents any character that can be found in a word – it could be alphabetic or numeric or an underscore. These are the only 3 types of characters that \w can find. For example, a single \w on this text basically captures all the word characters (a to z characters, 0-9 digits and underscore). You can see that in the output below.

matches = re.findall("\w",text)
print ( matches )

We need to step up from letters, to identify words. A word is just a repetition of a set of letters, numbers and underscores. So, we use a quantifier + to identify a word.

matches = re.findall("\w+",text)
print ( matches )

Now that we have all the words, all we have to do is to put together the pattern that includes the @ symbol and dot.

matches = re.findall("\w+@\w+.\w+",text)
print ( matches )


We are almost there, except for the last email – accounts@dell@com. This is not a valid email. So, why is our pattern capturing it? When we used a dot (.) in our pattern (\w+@\w+.\w+), it matched any character, including the second @. So, in order to match a literal dot, all we have to do is escape it – prepend it with a backslash (\.)

matches = re.findall("\w+@\w+\.\w+",text)
print ( matches )

There you go, we have successfully found all the valid emails in the text.

Learning

  • \w is used to represent a character in a word – it could be a letter (a-z, A-Z), a digit (0-9) or an underscore.
  • \w+ – to identify words, all you have to do is append a plus (+) to \w.
  • . (dot) is used to identify ANY character. It is a special character. To actually identify a . (dot) itself, just escape it with a backslash (\.).

Problem : Say we have a text with US zip codes. The valid format for US zip codes are

  • 99999
  • 99999-9999

where 9 represents any digit. Write a regular expression to extract all zip codes from the text.

text = '''08820, 08820-1245, zip code, 98487, 98487-0000, ABCD  '''

matches = re.findall ( "\d{5}-\d{4}|\d{5}", text)
print ( matches )
html = '''<font size=2>
          <font  size=2>
          <font size = 2>
          < font  size=2 >
          <font     size    =   2   >'''

Quiz
Which of the following regular expression captures all of the above combinations. Observe the spaces precisely.

  • “<s+font

Exercise : Say we have a text with Canadian zip codes. The format for canadian zip codes is

  • A1A A1A

where A represents an alphabet and 1 represents any digit. There is a space at the 4th character.

text = '''M1R 0E9
          M3C 0C1
          M3C 0C2
          M3C 0C3
          M3C 0E3
          M3C 0E4
          M3C 0H9
          M3C 0J1
          1M1 A1A
          11M 1A1
          M11 A1A
          M3C0J1
          M3C JJ1'''

# Test - The last five elements should NOT match

Solution

matches = re.findall ("[A-Z]\d[A-Z] \d[A-Z]\d", text)
print ( matches )

Log file analysis

Problem – Say there is a web server log file; find out how many times the login file was successfully hit and how many times it failed. For now, we will work with a sample snippet from the file. We will work with the real file in the next challenge.

log = '''
10.128.2.1 [29/Nov/2017:06:58:55 GET /login.php HTTP/1.1 Status Code  - 302
10.128.2.1 [29/Nov/2017:06:59:02 POST /process.php HTTP/1.1 Status Code  - 302
10.128.2.1 [29/Nov/2017:06:59:03 GET /home.php HTTP/1.1 Status Code  - 200
10.131.2.1 [29/Nov/2017:06:59:04 GET /js/vendor/moment.min.js HTTP/1.1 Status Code  - 200
10.130.2.1 [29/Nov/2017:06:59:06 GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1 Status Code  - 200
10.130.2.1 [29/Nov/2017:06:59:19 GET /profile.php?user=bala HTTP/1.1 Status Code  - 200
10.128.2.1 [29/Nov/2017:06:59:19 GET /js/jquery.min.js HTTP/1.1 Status Code  - 200
10.131.2.1 [29/Nov/2017:06:59:19 GET /js/chart.min.js HTTP/1.1 Status Code  - 200
10.131.2.1 [29/Nov/2017:06:59:30 GET /edit.php?name=bala HTTP/1.1 Status Code  - 200
10.131.2.1 [29/Nov/2017:06:59:37 GET /logout.php HTTP/1.1 Status Code  - 302
10.131.2.1 [29/Nov/2017:06:59:37 GET /login.php HTTP/1.1 Status Code  - 200
10.130.2.1 [29/Nov/2017:07:00:19 GET /login.php HTTP/1.1 Status Code  - 200
10.130.2.1 [29/Nov/2017:07:00:21 GET /login.php HTTP/1.1 Status Code  - 200
10.130.2.1 [29/Nov/2017:13:31:27 GET / HTTP/1.1 Status Code  - 302
10.130.2.1 [29/Nov/2017:13:31:28 GET /login.php HTTP/1.1 Status Code  - 200
10.129.2.1 [29/Nov/2017:13:38:03 POST /process.php HTTP/1.1 Status Code  - 302
10.131.0.1 [29/Nov/2017:13:38:04 GET /home.php HTTP/1.1 Status Code  - 200'''

solution

pattern = "(\d+.\d+.\d+.\d+).*(login.php)\s(HTTP).*-\s(\d{3})"

matches = re.findall (pattern, log)
print (matches)
count_200     = 0
count_not_200 = 0

for match in matches : 
    if match[3] == "200" :
        count_200 += 1
    else : 
        count_not_200 += 1
        
success_perc = ( count_200 / (count_200 + count_not_200) ) * 100
print ( " login was succesfully hit ", success_perc , "% of time")

Learning

  • (…) is used to represent groups in a regular expression
  • There can be multiple groups in a single regular expression
  • Each of the groups can be extracted out per each match of the regular expression
  • . (dot) represents ANY character

Challenge

Say there is a web server log file; find out how many times the login file was successfully hit and how many times it failed. The file is available in the data directory. If the HTTP code (at the end of each line in the log file) is 200, the page was successfully rendered. Otherwise, it is a failure.

Solution

# read file

data = [] # will contain the log data as a list
with open ( "./data/log_file.txt", "r") as f : 
    for line in f : 
        data.append(line)
        
# print the read data
for line in data [0:5]: 
    print ( line, end="")
# parse the data using regular expression and find matches for login.php
import re

login_data = []

pattern = "(\d+.\d+.\d+.\d+).*(login.php)\s(HTTP).*-\s(\d{3})"
for line in data : 
    matches = re.findall (pattern, line)
    if len(matches) > 0 : 
        login = []
        login.append(matches[0][0])
        login.append(matches[0][1])
        login.append(matches[0][2])
        login.append(matches[0][3])        
        
        login_data.append(login)
        
# print a sample 
for line in login_data[0:5]:
    print ( line)
# calculate the success ratio
count_200 = 0       # succesful
count_not_200 = 0   # unsuccesful

for element in login_data : 
    if element[3] == "200": 
        count_200 += 1
    else : 
        count_not_200 += 1

percentage_success = ( count_200 / (count_200 + count_not_200) ) * 100
print ( "Login page was succesfully hit ", percentage_success, "% of the time")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-192-2b3d022c0101> in <module>
      3 count_not_200 = 0   # unsuccesful
      4 
----> 5 for element in login_data :
      6     if element[3] == "200":
      7         count_200 += 1

NameError: name 'login_data' is not defined
text = '''Data science is an inter-disciplinary field  that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data. Data science is related to data mining and big data.

Data science is a "concept to unify statistics, data analysis, machine learning and their related methods" in order to "understand and analyze actual phenomena" with data. It employs techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, and information science. Turing award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational and now data-driven) and asserted that "everything about science is changing because of the impact of information technology" and the data deluge. In 2015, the American Statistical Association identified database management, statistics and machine learning, and distributed and parallel systems as the three emerging foundational professional communities.'''
pattern = re.findall(".{10}\s{2,}.{10}",text)
print ( pattern )

Quiz – The pattern above can be used to find out

  • words that have 2 or more spaces in between them
  • sentences that have 2 or more space in between them

Problem – Find credit card numbers identified by
their category (Visa, Master Card ). These credit card numbers follow a
certain pattern. Use the following pattern to identify the category.

Patterns

  • All Visa card numbers start with a 4 and have either 13 or 16 digits
  • All Master card numbers start with 51 through 55 or 2221 through 2720 and have exactly 16 digits
  • All Amex cards start with 34 or 37 and have exactly 15 digits
text = '''4018-2345-0303-0339,
          5335-6092-0182-9739,
          4076-2929-0000-2222,
          3777-5074-1547-439,
          5451-3970-1507-5040,
          3425-2515-2514-202,
          3752-2681-2429-924,
          4004-4759-3761-924,
          2228-2545-5555-2542,
          2296-2542-2587-2555,
          2321-2547-5145-2222,
          2650-2545-2222-5555,
          2706-2546-2589-2515,
          2713-9874-5263-6253,
          2720-2541-3256-6985
'''


Solution

Let’s see if the following solution works for Visa cards.

visa_matches = re.findall("4[0-9]{3}-[0-9]{4}-[0-9]{4}-[0-9][0-9]{3}?", text)
print ( visa_matches )


The last card is not being picked up. Let’s wrap the last 3 digits in a group and try it.

visa_matches = re.findall("4[0-9]{3}-[0-9]{4}-[0-9]{4}-[0-9]([0-9]{3})?", text)
print ( visa_matches )


ooh.. this time it only picks up the group. But we wanted the entire number, right ? There are a couple of options.

  • We can either put all of the remaining pattern also into groups.. like so
visa_matches = re.findall("(4[0-9]{3}-[0-9]{4}-[0-9]{4}-[0-9])([0-9]{3})?", text)
print ( visa_matches )

  • or, we can make the last element in the pattern a non-capturing group – meaning, it will still be a group from a syntax perspective, but will not be captured as a group. To do that, we use ?:.
visa_matches = re.findall("4[0-9]{3}-[0-9]{4}-[0-9]{4}-[0-9](?:[0-9]{3})?", text)
print ( visa_matches )

Whenever we use ?: at the beginning of a group, the group is still
used to match the pattern, but its contents are not captured as a
separate group. Now, let’s work on Master card.

Master card represents a different pattern. It has a pretty broad range of numbers – The beginning numbers start with

  • 51 through 55 OR
  • 2221 through 2720

The first one is easy enough. Let’s work on that first.

mc_matches = re.findall("5[1-5][0-9]{2}-[0-9]{4}-[0-9]{4}-[0-9]{4}", text)
print ( mc_matches )


The range 2221-2720 cannot be specified that easily. We need a different strategy for that. We can split this range as follows.

  • 2221-2229
  • 223x-229x ( 2230 to 2299 )
  • 23xx-26xx ( 2300 to 2699 )
  • 270x ( 2700 to 2709 )
  • 271x ( 2710 to 2719 )
  • 2720

We need to code all these patterns using an OR operator.

mc_matches = re.findall("222[1-9]-[0-9]{4}-[0-9]{4}-[0-9]{4}",text)
print(mc_matches)
mc_matches = re.findall("22[3-9][0-9]-[0-9]{4}-[0-9]{4}-[0-9]{4}",text)
print(mc_matches)

mc_matches = re.findall("2[3-6][0-9]{2}-[0-9]{4}-[0-9]{4}-[0-9]{4}",text)
print(mc_matches)
mc_matches = re.findall("270[0-9]-[0-9]{4}-[0-9]{4}-[0-9]{4}",text)
print(mc_matches)
mc_matches = re.findall("271[0-9]-[0-9]{4}-[0-9]{4}-[0-9]{4}",text)
print(mc_matches)

mc_matches = re.findall("2720-[0-9]{4}-[0-9]{4}-[0-9]{4}",text)
print(mc_matches)

In all these examples, the first four digits are the ones that are different. The pattern for the remaining 12 digits stays the same. So, let’s compress all of these into an OR-based pattern for the first 4 digits and keep the pattern for the remaining 12 digits constant.

mc_matches = re.findall ("(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|270[0-9]|271[0-9]|2720)-[0-9]{4}-[0-9]{4}-[0-9]{4}", text)
print ( mc_matches)


There we go – that should cover all the possible combinations of
Master cards. Now, lets move on to Amex. Amex has a really simple
pattern –

  • All Amex cards start with 34 or 37 and have exactly 15 digits
amex_matches = re.findall("3[47][0-9]{2}-[0-9]{4}-[0-9]{4}-[0-9]{3}", text)
print ( amex_matches )

Learning

  • (?:…) is used to represent non-capturing
    groups – meaning, they will be used to identify patterns, but the
    specific pattern within that paranthesis will not be captured as a
    group. We have seen how this will be useful in case of Visa pattern.
  • Cycling through a range of numbers. We have seen how to cycle
    through a large range of numbers when we discussed the pattern for
    Master card.

Word occurrences

Problem – Find all the occurrences of a word in a text and segregate them into 2 categories

  • 1 – standalone occurrence of the word
  • 2 – The word is part of another word.

For example, the word “bat” can occur in isolation, like in the
sentence (“His cricket bat is awesome”), or as part of a different word,
like in (“Aerial combat vs land-based combat”).

Solution – Finding the pattern is quite easy. However, the trick is to find out if the word occurred individually or is part of another word. Word boundaries can help in this case. \b is used to specify a word boundary.

text = '''Python is a general purpose programming language. Python's design philosopy ...
          Let's pythonify some of the code...
          while python is a high level language...'''
matches = re.findall("\bPython\b", text)
print ( matches )


But it is not working. Why? That is because \b is a special escape character for backspace. When you specify it in an ordinary pattern string, it is not treated literally, but interpreted as a backspace. To avoid confusion, always use raw strings to define patterns. Raw strings can be specified in Python by prepending the string with an r. Let’s try this again.

matches = re.findall(r"\bPython\b", text)
print ( matches )


That’s better. However, there are 3 occurrences of “Python”. Why is python in the last line not being picked up? That is because regular expressions are case sensitive. The “p” in the word “python” in the last line is lower case. If you want to do a case-insensitive search, use global flags. These can be specified as a third parameter in the findall ( ) function.

matches = re.findall(r"\bPython\b", text, re.IGNORECASE) # you can also use re.I as a shortcut
print ( matches )

['Python', 'Python', 'python']

Learning

  • \b is used to specify a word boundary.
  • Always prepend the pattern string with r to make it a raw string. This way, the characters in the pattern are taken literally.
  • global flags can be used to alter the way regular expressions work. One of the global flags we have seen is re.IGNORECASE. It can be used to do a case insensitive search.

HTML tags

Problem – Find all the tags in a HTML or XML.

For example, here is a small snippet of HTML. There are many tags like <html>, <head>, etc. We have to identify all the tags used in the following HTML.

import re 
text = '''<html>
            <head> 
            <title> What's in a title</title
            </head>
            <body>
                <tr>
                    <td>text one </td>
                    <td>text two </td>
                </tr>
            </body>
          </html>
'''

Solution

matches = re.findall(r"<([^>]*)>", text)
print ( matches )
['html', 'head', 'title', '/title\n            </head', 'body', 'tr', 'td', '/td', 'td', '/td', '/tr', '/body', '/html']

This gives the start and end tags. Say, we don’t want the end tags.

matches = re.findall(r"<([^>/]*)>", text)
print ( matches )
['html', 'head', 'title', 'body', 'tr', 'td', 'td']

Another way to do it is to use non-greedy quantification. When you start a pattern with < and consume any character with .*, it consumes text all the way to the last > it can find. That is why * and + are called greedy quantifiers. To negate this effect, append the ? operator. That way the * matches the least amount of text needed before the regular expression is satisfied.

import re

matches = re.findall("<.*?>", text)
print ( matches )
['<html>', '<head>', '<title>', '</head>', '<body>', '<tr>', '<td>', '</td>', '<td>', '</td>', '</tr>', '</body>', '</html>']

Learning

  • + and * are greedy quantifiers. They consume the most amount of text before a pattern can be satisfied.

title

Problem – as

text = '''
ages = [22, 23, 24]
9_age = [22, 23, 24]
'''

matches = re.findall("[a-zA-Z_][a-zA-Z_0-9]{1,20}", text)
print ( matches)
matches = re.findall("[0-9][a-zA-Z_0-9]{1,20}", text)
print ( matches)
['22', '23', '24', '9_age', '22', '23', '24']

Password verification

Problem – Verify that a password

  • Has at least one upper case character
  • Has at least one digit.
  • Has at least one special character ( let’s limit special characters to @ , # , $ , % )
text = '''Aw@som$passw0rd
          Awesomepassw0rd
          Awesomepassword
          Aw!som!passw0rd
          aw!som!passw0rd'''

# All combinations except for the first one are invalid


Solution

This can be solved easily using regular python lists. However, we wanted a more concise solution using regular expressions. In these kinds of situations, we are looking for some kind of validation. Regular expression’s lookaround function is very useful in these cases. The syntax for that is (?=…) where … represents any regular expression. Let’s start with the first condition

matches = re.findall("[^A-Z]*[A-Z].*", text)
print(matches)
matches = re.findall("[^A-Z]*[A-Z]\D*\d.*", text)
print(matches)


This works too. Now, let’s try a different combination – put the digits before the letters.

matches = re.findall("[^A-Z]*[A-Z]\D*\d.*", text)
print(matches)

That failed. Why? Because regular expressions consume text and move forward. So, the expression [^A-Z]*[A-Z] consumed all the text up to the capital letter, including the 1 at the beginning. And it is now looking for a digit with \d, which it cannot find after the capital letter. This is where lookarounds help.

matches = re.findall("(?=[^A-Z]*[A-Z])(?=\D*\d).*", text)
print(matches)

This time it works. The reason is that we have converted the digit search \D*\d into a lookahead (?=\D*\d).

An important aspect of lookarounds (look ahead or look behind) is that they do not consume any characters. For example, look at the example below. We want to find all the words that are followed by a comma, but we don’t want to show the comma.

text = "Hi there, how are you doing ?"

# \b for word boundary
# \w+ for a word
#(?=,) will ensure that the word is followed by a comma

matches = re.findall(r"\b\w+(?=,)", text)
print ( matches )

See, the comma is not shown in the output. Granted, it is not a big deal. We can do that using groups. However, there are many situations (like the password example above) that cannot be achieved using groups. That’s where lookarounds come in. Let’s continue the same example as above and find all the words preceded by a comma.

# (?<=,\s) => verify (assert) that before the word, there is a comma followed by a space
# \w+ is a word

matches = re.findall(r"(?<=,\s)\w+", text)
print ( matches )


Learning

  • There are 2 types of lookarounds – look ahead and look behind.
  • (?=…) is used to do a look ahead search and (?<=…) is used to do a look behind search.
  • Lookarounds are also called assertions

Regular expressions cheatsheet

Special Character Description
. Matches any character – except new line
[XYZ] Character set
[^XYZ] Negation of the Character set
[A-Z] Matches any character in the range A to Z
| (pipe) Logical OR
. Matches any character – except new line
\w Matches any word character. Equivalent to [A-Za-z0-9_]
\W Negation of any word character. Equivalent to [^A-Za-z0-9_]
\d Matches any digit. Equivalent to [0-9]
\D Matches any non-digit. Equivalent to [^0-9]
\s Matches any whitespace character ( spaces, tabs or linebreaks )
\S Matches any non-whitespace character
^ Matches beginning of line
$ Matches end of line
\b Word boundary
\B not a word boundary
* Zero or more
+ One or more
? Zero or one
(XYZ) Capturing group
(?:XYZ) non-capturing group
(?=XYZ) Positive lookahead
(?!XYZ) Negative lookahead
(?<=XYZ) Positive lookbehind
(?<!XYZ) Negative lookbehind

Challenge

Say we have a bunch of cities along with their nicknames in the following format in a text file. Extract each city and its nicknames in a JSON format with the structure as follows.

cities = '''
1. Paris – The City of Love, The City of Light, La Ville-Lumiere

2. Prague – The City of Hundred Spires, The Golden City, The Mother of Cities

3. New York – The Big Apple

4. Las Vegas – Sin City'''

# required output format
{
    "city_1" : ["nick name 1", "nick name 2"], 
    "city_2" : ["nick name 1", "nick name 2.."]
}

#import the file

with open("./data/cities.txt","r") as f : 
    data = f.read()

import re 

matches = re.findall("\d+.\s+(\w+\s?\w+)\s+–\s+(.*)", data)
print ( matches[0:5])
import json

city_dict = {}

for city in matches : 
    city_dict[city[0]] = city[1].split(",")
    
city_json = json.dumps(city_dict)

Есть текст:

['Розенфельд А.С.']
#920: ASP
#900: ^B08
#200: ^AПрофессионально-прикладная физическая подготовка студентов аграрного вуза
#102: RU
#1107: ^F0^J0^R0^S0^W0^Z0
#463: ^CАграрное образование и наука^J2018^GФедеральное государственное бюджетное образовательное учреждение высшего образования "Уральский государственный аграрный университет"^DЕкатеринбург^S3-3^HN 4
#1105: Статья в журнале
#963: ^I2309-7671
*****
['Цихалевский И.С.', 'Буйносов А.П.']
#920: ASP
#900: ^B08
#200: ^AОрганизация эксплуатации, обслуживания и ремонта газотурбовозов гт1h
#102: RU
!!!!!!!!!!!!!!!!!!!!!!!!!
#1107: ^F1^J0^R0^S0^W0^Z0
#463: ^CВестник Уральского государственного университета путей сообщения^J2018^GФедеральное государственное бюджетное образовательное учреждение высшего образования "Уральский государственный университет путей сообщения"^DЕкатеринбург^S43-55^HN 3 (39)
#1105: Статья в журнале
#963: ^I2079-0392
*****
['Некрасов К.В.']
#920: ASP
#900: ^B08
#200: ^AИнновационная деятельность организаций отрасли
#102: RU
#1107: ^F0^J0^R0^S0^W0^Z0
#463: ^CАграрное образование и наука^J2018^GФедеральное государственное бюджетное образовательное учреждение высшего образования "Уральский государственный аграрный университет"^DЕкатеринбург^S28-28^HN 4
#1105: Статья в журнале
#963: ^I2309-7671
*****

Нужно удалить часть текста с «!!!!!!!!!!»(восклицательными знаками). Желаемый результат:

['Розенфельд А.С.']
#920: ASP
#900: ^B08
#200: ^AПрофессионально-прикладная физическая подготовка студентов аграрного вуза
#102: RU
#1107: ^F0^J0^R0^S0^W0^Z0
#463: ^CАграрное образование и наука^J2018^GФедеральное государственное бюджетное образовательное учреждение высшего образования "Уральский государственный аграрный университет"^DЕкатеринбург^S3-3^HN 4
#1105: Статья в журнале
#963: ^I2309-7671
*****
['Некрасов К.В.']
#920: ASP
#900: ^B08
#200: ^AИнновационная деятельность организаций отрасли
#102: RU
#1107: ^F0^J0^R0^S0^W0^Z0
#463: ^CАграрное образование и наука^J2018^GФедеральное государственное бюджетное образовательное учреждение высшего образования "Уральский государственный аграрный университет"^DЕкатеринбург^S28-28^HN 4
#1105: Статья в журнале
#963: ^I2309-7671
****

Попыталась сделать при помощи регулярного выражения:

import chardet
import re

with open('АРМ.txt', 'rb') as file:
    file_read = file.read()
    result = chardet.detect(file_read)
    f = file_read.decode(result['encoding'])
    print(f)

items = re.sub('*****[^>]+*****', '', f)
print(items)

Но выходит ошибка:

re.error: nothing to repeat at position 0

Подскажите, пожалуйста, как можно удалить часть текста? Заранее благодарна за ответ.

Я пытаюсь разобрать текст, в котором мое регулярное выражение не работает с ошибкой, упомянутой в теме.

В приведенном ниже коде я использую только эту часть, я не знаю, почему она не работает! любая помощь будет оценена

Хотя я видел эту ошибку в темах SO, но мне это кажется другим.

import re
t = '++cnt;'

re.sub('++cnt','@'.join(c for c in '++cnt'),t)

error                                     Traceback (most recent call last)
<ipython-input-482-2b724235a79b> in <module>
      1 t = '++cnt;'
      2 
----> 3 re.sub('+cnt','@'.join(c for c in '++cnt'),t)

~/anaconda3/lib/python3.8/re.py in sub(pattern, repl, string, count, flags)
    208     a callable, it's passed the Match object and must return
    209     a replacement string to be used."""
--> 210     return _compile(pattern, flags).sub(repl, string, count)
    211 
    212 def subn(pattern, repl, string, count=0, flags=0):

~/anaconda3/lib/python3.8/re.py in _compile(pattern, flags)
    302     if not sre_compile.isstring(pattern):
    303         raise TypeError("first argument must be string or compiled pattern")
--> 304     p = sre_compile.compile(pattern, flags)
    305     if not (flags & DEBUG):
    306         if len(_cache) >= _MAXCACHE:

~/anaconda3/lib/python3.8/sre_compile.py in compile(p, flags)
    762     if isstring(p):
    763         pattern = p
--> 764         p = sre_parse.parse(p, flags)
    765     else:
    766         pattern = None

~/anaconda3/lib/python3.8/sre_parse.py in parse(str, flags, state)
    946 
    947     try:
--> 948         p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
    949     except Verbose:
    950         # the VERBOSE flag was switched on inside the pattern.  to be

~/anaconda3/lib/python3.8/sre_parse.py in _parse_sub(source, state, verbose, nested)
    441     start = source.tell()
    442     while True:
--> 443         itemsappend(_parse(source, state, verbose, nested + 1,
    444                            not nested and not items))
    445         if not sourcematch("|"):

~/anaconda3/lib/python3.8/sre_parse.py in _parse(source, state, verbose, nested, first)
    666                 item = None
    667             if not item or item[0][0] is AT:
--> 668                 raise source.error("nothing to repeat",
    669                                    source.tell() - here + len(this))
    670             if item[0][0] in _REPEATCODES:

error: nothing to repeat at position 0


Понравилась статья? Поделиться с друзьями:
  • Re error multiple repeat at position 10
  • Re error invalid group reference 2 at position 1
  • Re ctr stick error
  • Rdy ошибка частотника schneider
  • Rdworksv8 не видит станок пишет ошибка соединения