#Downloads addresses from:
#http://www.summet.com/dmsi/html/codesamples/addresses.html


import urllib.request
from re import findall
url = "http://www.summet.com/dmsi/html/codesamples/addresses.html"


# Fetch the page. The "with" block closes the HTTP response as soon as the
# body has been read, even if read() raises.
with urllib.request.urlopen(url) as response:
    html = response.read()
    # Charset named in the Content-Type header; None when the server does
    # not declare one, in which case we fall back to UTF-8.
    charset = response.headers.get_content_charset() or "utf-8"

# Decode the raw bytes into real text. Calling str() on a bytes object
# returns its repr ("b'...'"), in which newlines, apostrophes and non-ASCII
# bytes show up as backslash escapes instead of the actual characters.
# errors="replace" keeps a stray undecodable byte from aborting the script.
try:
    htmlStr = html.decode(charset, errors="replace")
except LookupError:
    # The server advertised a charset name Python does not know.
    htmlStr = html.decode("utf-8", errors="replace")

#Previous work that builds up to the final solution below:
#Find all phone numbers. Note how we have to escape the
#parentheses with backslashes, as they have a
#special RegEx meaning.

#pdata = findall("\(\d{3}\) \d{3}-\d{4}", htmlStr)

#for item in pdata:
#    print(item)

#Find all the names. Use the <li>Firstname Lastname<br/> pattern.
#Note how we use parentheses (a capture group) to isolate and pull out
#just the name data, leaving the bracketing HTML tags <li> and <br/> alone.

#ndata = findall("<li>([A-Za-z]+ [A-Za-z]+)<br/>", htmlStr)


#Example which captures the first and last name separately:
#ndata = findall("<li>([A-Za-z]+) ([A-Za-z]+)<br/>", htmlStr)

#for item in ndata:
#    print(item)

# We could try to do the following but it would match only the first name
#and the last phone number!
# <li>(\S+ \S+)<br/>.*<br/>(\(\d{3}\) \d{3}-\d{4})</li>
#Note that on gskinner.com/RegExr the above works, but it doesn't work
#in python!
#data = findall("<li>(\S+ \S+)<br/>.*<br/>(\(\d{3}\) \d{3}-\d{4})</li>", htmlStr)


#Now, find the names, and keep them linked to the phone numbers.
#We have to "match" the two intervening lines of address/city,state/zip
#but NOT capture those lines, so we leave them outside the parentheses
#of the capture groups.
# NOTE: We have to use [^<]* to match everything up until the <br/>
# tags if we want to match an exact number of lines...if we use .* it will match many of them!


# Pattern for one address entry, written as raw strings so the regex escapes
# (\S, \d, \( ...) reach the re module untouched; in a normal string literal
# they are invalid escape sequences and trigger a warning on modern Python.
contact_pattern = (
    # Group 1: two whitespace-free words separated by one space, right
    # after <li> and right before the first <br/>.
    r"<li>(\S+ \S+)<br/>"
    # Two uncaptured runs of non-"<" text, each terminated by <br/>.
    r"[^<]*<br/>[^<]*<br/>"
    # Group 2: a phone number shaped like (ddd) ddd-dddd, right before </li>.
    r"(\(\d{3}\) \d{3}-\d{4})</li>"
)

# With two capture groups, findall returns a list of (name, phone) tuples.
data = findall(contact_pattern, htmlStr)
for item in data:
    print(item)

print("Done!")