#Downloads addresses from:
#http://www.summet.com/dmsi/html/codesamples/addresses.html
import urllib.request
from re import findall
url = "http://www.summet.com/dmsi/html/codesamples/addresses.html"

#Previous work that builds up to the final solution below.
#NOTE(review): the literal HTML tag names these examples were written with
#are missing from this copy of the file. <tag>, </tag> and <sep> below are
#placeholders for them - confirm against the page markup before reusing
#any of the commented-out patterns.

#Find all phone numbers. Note how we have to escape the
#curved brackets (parentheses) with backslashes, as they have a
#special RegEx meaning.
#pdata = findall(r"\(\d{3}\) \d{3}-\d{4}", htmlStr)
#for item in pdata:
#    print(item)

#Find all the names. Use the <tag>Firstname Lastname</tag> pattern.
#Note how we use curved brackets to isolate and pull out just the
#name data, leaving the bracketing HTML tags <tag> and </tag> alone.
#ndata = findall(r"<tag>([A-Za-z]+ [A-Za-z]+)</tag>", htmlStr)

#Example which captures the first and last name separately:
#ndata = findall(r"<tag>([A-Za-z]+) ([A-Za-z]+)</tag>", htmlStr)
#for item in ndata:
#    print(item)

#We could try to do the following but it would match only the first name
#and the last phone number!
#  <tag>(\S+ \S+)</tag>.*<sep>(\(\d{3}\) \d{3}-\d{4})
#Note that on gskinner.com/RegExr the above works, but it doesn't work
#in python!
#NOTE(review): presumably this is because str() on the downloaded bytes
#spells every line break as the two characters backslash + n, so the whole
#page is a single line and the greedy .* can run across every record.
#data = findall(r"<tag>(\S+ \S+)</tag>.*<sep>(\(\d{3}\) \d{3}-\d{4})", htmlStr)

#Now, find the names, and keep them linked to the phone numbers.
#We have to "match" the two intervening lines of address/city,state/zip
#but NOT capture those lines.
#NOTE: We have to use [^<]* to match everything up until the next tag
#if we want to use exact numbers...if we use .* it will match many of them!

#Whitespace between pieces of markup. The page text comes from str() on
#bytes, so a line break may be spelled as backslash + n rather than as a
#real newline; both spellings are accepted.
_GAP = r"(?:\s|\\[nrt])*"
#A single markup tag (opening or closing).
_TAG = r"<[^>]*>"
#A run of one or more tags, kept in a non-capturing group so it does not
#show up in the findall() results.
_TAGS = "(?:" + _TAG + _GAP + ")+"
#Two words separated by one space. The original pattern used \S here with a
#literal opening tag in front of it; with a generic tag in front, \S could
#swallow markup, so '<', '>' and whitespace are excluded explicitly.
_NAME = r"([^\s<>]+ [^\s<>]+)"
#One intervening text line (street, or city/state/zip): matched, not captured.
_LINE = r"[^<]*"
_PHONE = r"(\(\d{3}\) \d{3}-\d{4})"

#name, tags, line, tags, line, tags, phone - with a tag in front of the name.
CONTACT_PATTERN = (
    _TAG + _GAP + _NAME + _TAGS + _LINE + _TAGS + _LINE + _TAGS + _PHONE
)


def fetch_page(address: str) -> str:
    """Download *address* and return the response body as a string.

    The body is converted with ``str()`` rather than decoded, so the result
    is the bytes repr: it starts with ``b'`` and line breaks appear as the two
    characters backslash + ``n``.  This is the same text the script has always
    handed to its patterns.
    """
    #The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(address) as response:
        html = response.read()
    #NOTE(review): html.decode(...) would be the conventional conversion; it
    #is left as str() so the notes above about '.*' still describe the text
    #that is actually searched.
    return str(html)


def extract_contacts(text: str) -> list:
    """Return a list of ``(name, phone)`` tuples found in *text*.

    A contact is a two-word name directly after a markup tag, followed by two
    text lines that are skipped, followed by a phone number of the form
    ``(ddd) ddd-dddd``.  The pieces must be separated by one or more markup
    tags.  Returns an empty list when nothing matches.
    """
    return findall(CONTACT_PATTERN, text)


def main() -> None:
    """Fetch the address page, print each (name, phone) pair, then 'Done!'."""
    htmlStr = fetch_page(url)
    data = extract_contacts(htmlStr)
    for item in data:
        print(item)
    print("Done!")


if __name__ == "__main__":
    main()