Lists and Aliasing

visualizer

X = [0, 5, "cat", 2]
Z = X
print(Z)
X[0] = "dog"
print(X)
print(Z)

X = [0, 5, "cat", 2]
Z = [5, 6, 7, 8]
X = Z
print(X)
print(Z)
#Question: After the line X = Z, is there any way
#to recover the list containing "cat"?

def change_up(A, B, C):
    #Aliasing exercise: assigns into C by index, rebinds the local name B,
    #then prints A, B and C (in that order). Nothing is returned.
    C[0] = B[0]
    B = A    
    print(A)
    print(B)
    print(C)

X = ["he's", "got", "attitude"]
Y = ["bears", "love", "Thursdays"]
Z = ["never", "eat", "hotdogs"]

change_up(X, Y, Z)
print(X)
print(Y)
print(Z)

Dictionaries

L = []
D = {}
print(type(L))
print(type(D))
#L[4] = 40  Why does this cause an error if uncommented?
D[4] = 40
print(D)
for i in range(5):
    L.append(i*10)
    D[i] = i*10
print(L)
print(D)

D = {}
D["tacos"] = "delicious"
D["tacos"] = 30
D[9] = 99
print(D)
D[[1, 2, 3]] = 6 #why does this cause an error?

D = {}
D['uno'] = 'one'
D['dos'] = 'two'
D['tres'] = 'three'
print(D.keys())
print(D.values())
print('uno' in D)
print('one' in D)
print('one' in D.values())
del D['dos']
print(D)

D = {}
D['Vancouver'] = 'Canucks'
D['Calgary'] = 'Flames'
D['Atlanta'] = 'Thrashers'
D2 = D
del D2['Atlanta']
D2['Winnipeg'] = 'Jets'
print(D)
print(D2)

D = {}
D['Vancouver'] = 'Canucks'
D['Calgary'] = 'Flames'
D['Atlanta'] = 'Thrashers'
D2 = D.copy()
del D2['Atlanta']
D2['Winnipeg'] = 'Jets'
print(D)
print(D2)

some_dict["hats"] = ["straw", "cloth", "wood", "brick"] #Why does this cause an error? How do you fix it?
print(some_dict)

P = {}
P['Richard Nixon']  = 2
P['Gerald Ford'] = 1
P['Jimmy Carter']  = 1
P['Ronald Reagan']  = 2
P['George Bush']  = 1
P['Bill Clinton']  = 2
P['George W. Bush']  = 2
P['Barack Obama']  = 2

#The get method returns the value in the dictionary
#if the key exists, and returns the second argument
#to get otherwise

#It's a graceful way of handling keys that don't
#exist in the dictionary

print(P.get('Jimmy Carter', 0))
print(P.get('Josh Hug', 0))
print(P.get('Jimmy Carter', 5884874))

letter_counter = {}
for letter in 'syzygy':
    letter_counter[letter] = letter_counter.get(letter, 0) + 1
print(letter_counter)

Tuples

T = ('alabama', 'alaska', 'arizona', 'arkansas')   
print(T[1:3])
T = T + T #why does this not cause an error?
print(T)

a, b, c = ("one", "two", "three")
print(b)
a, b, c = "one", "two", "three"
print(b)

def return_powers(x):
    #Returns the square and the cube of x together.
    #A comma-separated return value is packed into a single tuple, which
    #the caller may unpack (s, c = ...) or keep whole (x = ...).
    return (x ** 2, x ** 3)

s, c = return_powers(5)
print(s)
print(c)
x = return_powers(5)
print(x)
print(type(x))

Sets

What is the output of the following program?

S = set(["ranchero", "sauce", "bean", "sauce"])
print(S)
S = set("syzygy")
X = set("gyz")
print(S)
print(X)
print(S.issubset(X))
print(X.issubset(S))

Another Look at Strings

The full documentation for strings can be found at: http://docs.python.org/2/library/string.html

import string
s = "a horse is a horse of course, of course, horse, horse, horse"
print(string.find(s, "horse"))
print(s.find("horse") )
print(s.find("zebra") )
print(s.find("horse", 10) )
print(s.find("horse", 10, 14) )

import string
s = "a horse is a horse of course, of course"
words = s.split()
print(words)
print(type(words))
count = 0
for word in words:
    if (word == "horse"):
        count = count + 1
print(count)
print(s.count("horse"))
print(string.count(s, "horse"))
print(s.replace("horse", "zebra"))
s.replace("horse", "potato")
print(s)

s = "        this desert is devoid of any water   "
print(s.lstrip())
print(s.rstrip())
print(s.strip())
print(s.startswith("this"))
print(s.strip().startswith("this"))

Note, if you run this one in Canopy, you might need to do Run->Restart Kernel to get it to terminate the loop.

while (True):
    user_string = raw_input("Give me an integer: ")
    if user_string.isdigit():
        print(int(user_string) ** 2)
    else:
        print("Non-integer value entered.")

starter = "BCDFHGJKLMNP"
cnt = 0
for letter in starter:
    #Every third line closes the verse with "<letter>ay."; the other
    #lines just end with a comma and a space
    ending = " " + letter + "ay." if cnt % 3 == 2 else ", "
    outputstr = "%cing %cang %cingalong" % (letter, letter, letter) + ending
    print(outputstr)
    cnt += 1

Exceptions

#Repeatedly asks for a number and prints its square.
#Only the conversion/squaring is inside the try, and only the errors it is
#expected to raise are caught. A bare "except:" here would also swallow
#KeyboardInterrupt (Ctrl-C) and EOFError, making the loop impossible to stop.
while (True):
    user_string = raw_input("Give me a number: ")
    try:
        print(float(user_string) ** 2)
    except ValueError:
        #float() raises ValueError when the text is not a number
        print("Non-numeric value entered.")
    except OverflowError:
        #squaring a very large float (e.g. 1e200) overflows
        print("Number too large to square.")

try:
    open("iafuhweoiuahwlieufhaweufh.txt", "r")
    print("Success")
except IOError as e:
    print(e)
    print(type(e))

try:
    int(66.4)
    int("dog")
    d[0] = 5
except Exception as e:
    print(type(e))
    print(e)

Coding exercises

Write a function is_valid_sequence(s) that takes a string as input and returns True if the string contains only the letters A, C, T, or G, and False otherwise. You should accept either uppercase or lowercase letters. For example:
print(is_valid_sequence('CACGTGGC'))
print(is_valid_sequence('CAcggtgcA'))
print(is_valid_sequence('TAGCCQ'))

True
True
False

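Write a function is_valid_sequence_dictionary(D) that takes a dictionary as input. The dictionary maps binding site names to sequences. The function should return True if every sequence in the dictionary is a valid sequence, and False otherwise. For example: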

D = {'ABF': 'CACGTGGC', 'ABRE': 'TACGTGGC', 'LTRE': 'ACCGACA', 'ERE': 'TAAGAGCCGCC'}
print(is_valid_sequence_dictionary(D))
D2 = {'Spo0A': 'GTCGHAHEGQ', 'Spo0B': 'GATGA'}
print(is_valid_sequence_dictionary(D2))


True

False

Write a function sorted_keys(D) that takes a dictionary as input and returns a list of the keys in sorted order.

D = {'california': 38041430, 'texas': 26059201, 'new york': 19570261, 'florida': 19317568}
print(sorted_keys(D))


['california', 'florida', 'new york', 'texas']

Write a function GC_content(s) that takes a DNA sequence as input and returns a floating point number representing the percentage of the bases that are either Gs or Cs. If the given string is not a valid_sequence, GC_content should return None instead.
print(GC_content("GCGCGC"))
print(GC_content("TATA"))
print(GC_content("TATAGCGCGC"))
print(GC_content("TATAMMM"))

100.0
0.0
60.0
None

Design Case Study

This was covered in lecture. It is provided again here for your reference later in life (or if you missed lecture (or if you're just some random person on the internet)).

Being able to grab data from the web is a nice thing. For example, suppose we want to know the latitude and longitude of a city. In this case study, I'll go through the design of a tool that collects this data from wikipedia.

The first step in developing such a tool is to figure out how to generate the URL for a given city. In lab3, we saw (briefly) an example of reading from a wikipedia page. The code below reads the entire text of the article about Eduard Khil and stores it in a string.

#Read the raw text of the Eduard Khil article into the string s
url = 'http://en.wikipedia.org/w/index.php?action=raw&title=Eduard_Khil'
try:
    u = urllib2.urlopen(url)
    try:
        s = u.read()
    finally:
        #always release the connection, even if read() fails
        u.close()
except Exception:
    #Any failure to open or read the page is reported, but Ctrl-C
    #(KeyboardInterrupt) is no longer swallowed as a bare except would do.
    #print(...) with parentheses matches the rest of these notes.
    print("Could not read " + url)

Our goal is to do something very similar, but for the city of our choice. We start by googling "los angeles". The first hit is the wikipedia page, which is given by http://en.wikipedia.org/wiki/Los_Angeles.

We could proceed from here by simply taking our Eduard Khil URL and copying the beginning part, yielding url = 'http://en.wikipedia.org/w/index.php?action=raw&title=Los_Angeles'.

However, we don't want to have to use google every time we use our program. Instead, we'll write Python code that is able to take a string and generate a URL. We make an assumption that the URL is simply the same thing as the name of the string but with underscores instead of spaces. We simply hope that capitalization does not matter, knowing that we can fix this later if necessary.

def generate_city_URL(city_name):
    #Builds the Wikipedia URL (action=raw) for the article titled city_name.
    #Each space in the name becomes an underscore; nothing else is changed.
    base_URL = 'http://en.wikipedia.org/w/index.php?action=raw&title='
    return base_URL + '_'.join(city_name.split(' '))

The next step in this process is to test this function. We open up Canopy or Idle and save the function above in a file and run a few tests (tests not shown).

Now that we can get the data from the website, we go to the page in our web browser and manually look through all the data to identify the location of the information we're after. Somewhat near the beginning of the page, we see the following:

|timezone = [[Pacific Time Zone|PST]]
|utc_offset = -8
|timezone_DST = PDT
|utc_offset_DST = −7
|latd = 34
|latm = 03
|lats =
|latNS = N
|longd = 118
|longm = 15
|longs =
|longEW = W
|elevation_m = 71
|elevation_ft = 233 (city hall)

The latitude in degrees seems to be on a line by itself which starts with |latd = . To make sure that this pattern persists for other cities, we check the wikipedia entry for New York City.

We see that it doesn't quite match the same pattern. In particular, the latitude appears in the following context:

| pushpin_label           = New York
| pushpin_map_caption     = Location in the [[United States]]
| latd  = 40 |latm  = 40.2 |latNS  = N 
| longd = 73 |longm = 56.4 |longEW = W
| coor_pinpoint           = 
| coordinates_region      = US-NY
| coordinates_type        =

We note some important differences. Whereas the Los Angeles article has each part of the location on a separate line, New York puts everything having to do with latitude on the same line. And whereas Los Angeles includes a blank entry for the latitude in seconds, New York omits this line entirely.

We could look at more cities to try to abstract the pattern, but we'll make the risky move of assuming that the latitude in degrees is always preceded by "|latd = ". It's probably unsafe to assume anything at all about whitespace, so our code will be written to ignore all whitespace.

Likewise, I don't personally feel so comfortable that everything will appear in the same order, so I'll be reading in the entire page as a single string using read(), as opposed to going through the string line by line using readline().

Since we want to read in the entire web page as a single string, this provides us with the next natural function to implement, namely fetch_website(URL) which takes a URL and returns a string containing the data from the wikipedia page. This is a relatively straightforward function to write using lab3 as a guide:

def fetch_website(URL):
    #Returns everything read from URL as a single string, or None if the
    #page could not be opened or read.
    try:
        u = urllib2.urlopen(URL)
        try:
            return u.read()
        finally:
            #close the connection even if read() fails part-way through
            u.close()
    except Exception:
        #Best effort: any ordinary failure yields None. Unlike a bare
        #"except:", this does not swallow KeyboardInterrupt or SystemExit.
        return None

We again test our function before proceeding (tests not shown). We observe that we can get all of the information about nyc on wikipedia into a string with a simple pair of function calls:

nyc_data_string = fetch_website(generate_city_URL('New York City'))

Our next task is to apply what we learned earlier about the format of the data. In particular, we'll write a function called get_latd(s) which gets the latitude in degrees of a city from a string.

We have to be both careful and daring here. We have to guess the pattern that all (or at least most) wikipedia pages follow. If we're wrong, it's no big deal, we'll just adapt our code to match the violations of our assumption.

Using | latd = 40 |: It appears that the fundamental thing that gives away the latitude is that it's a numerical string that appears between "| latd =" and "|", but we'll ignore whitespace, since this is relatively easy and likely to improve the robustness of our code. It is entirely possible that some wikipedia pages may violate this assumption, but we'll deal with that later since we're just trying to build a prototype.

Thus, before we can move on, we want to develop code that takes a string and removes all whitespace.

We first observe that we cannot use our new fancy strip() methods, because strip() only removes whitespace on either side of a string. One approach would be to do something like s.replace(" ", ""). An example of such a function is given below:

def remove_whitespace(s):
    #Returns s with every space, tab, newline and carriage return removed.
    #Other whitespace characters are left alone.
    for whitespace_char in (" ", "\t", "\n", "\r"):
        s = s.replace(whitespace_char, "")
    return s

While this should work, it seems a little kludgy. The right thing to do in this circumstance is to google something like "remove all whitespace string python", and you'll almost certainly find yourself at the incredibly useful stackoverflow.com. If the task is super simple to state like this, someone has almost certainly posted about it.

We see that someone suggests that this task can be accomplished with a single somewhat mysterious line of code "".join(s.split()). This is a pretty tricky piece of code to come up with, particularly since you just learned about join and split today!

Another mysterious suggestion from Stack Overflow is to use:

import re
re.sub(r'\s+', '', s)

Not being a Python expert, this is probably the approach I'd use if I were coding, but we haven't learned regular expressions yet (probably week 6 or 7?) so it's obviously just some random garbage to you right now. At any rate, we now have two highly voted solutions, and you should feel free to copy and paste even mysterious code like this into your code. It might break, sure, but if it has a ton of upvotes, it's probably right. If you're particularly unsure, you can go to the Python documentation and learn WHY the code works. But again, often when prototyping, you don't really care about robustness at all.

I do think understanding "".join(s.split()) is pretty neat, and I encourage you to break this out into a few separate lines of code so you can understand why it works to remove all whitespace.

It is probably a good idea to write comments in your code that cite your sources. This is partially to give credit where credit is due, but it is actually mostly so that future people working on the code (possibly including yourself months or years later) know where to go looking if something is broken. This leaves us with the remove_whitespace(s) method given below.

#As per this post on Stack Overflow:
#http://stackoverflow.com/questions/8270092/python-remove-whitespaces-in-string
def remove_whitespace(s):
    #Returns s with every run of whitespace deleted
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', s)

We follow up by testing our method on a few strings, again not shown. Now that we have a method that removes whitespaces, we can finally move on to get_latd(s).

One choice we have to make is whether or not the string that is input to get_latd already has the spaces removed. There's really no right or wrong choice. I've arbitrarily chosen that it should already have spaces removed.

From here, our first goal is to find "|latd=". This is easy, as we can simply use the string method string.find. Our code should start out something like:

def get_latd(s):
    #First step only: locate the "|latd=" marker; nothing is returned yet
    latd_marker_location = s.find("|latd=")

Since the length of "|latd=" is 5, we expect that latd_marker_location+5 will be the location of the first digit of the latitude.

def get_latd(s):
    #Still a partial sketch: computes a start index but returns nothing
    latd_marker_location = s.find("|latd=")
    #NOTE(review): "|latd=" is 6 characters long, so an offset of 5 points
    #at the "=" rather than at the first digit of the latitude
    latd_start = latd_marker_location + 5

This is not the best coding style. The 5 is what is sometimes called a 'magic number' -- i.e. some random number in code that has no clear purpose or meaning. (It is also easy to get wrong: "|latd=" is actually 6 characters long, so the 5 above is off by one.) We could fix this by adding a comment explaining the situation, but it's better to write code that is self-documenting where possible, which we can achieve by assigning things to variables with obvious names and letting len() compute the length for us. We should also add comments just to be safe. We can thus augment our function as follows:

def get_latd(s):
    #The latitude appears to always be preceded by |latd=
    latd_marker_string = "|latd="
    
    latd_marker_location = s.find(latd_marker_string)

    #latd_start is the location of the first digit of the latitude in degrees
    #(len() is used so the length of the marker is never hard-coded)
    latd_start = latd_marker_location + len(latd_marker_string)

It's probably best to follow up by testing that latd_start is indeed the location of the first digit. To do this, we'll simply add a temporary print statement for debugging purposes (we'll discuss fancier debugging techniques later in the class).

def get_latd(s):
    #The latitude appears to always be preceded by |latd=
    latd_marker_string = "|latd="

    latd_marker_location = s.find(latd_marker_string)

    #latd_start is the location of the first digit of the latitude in degrees
    latd_start = latd_marker_location + len(latd_marker_string)

    #temporary debugging aid: print (up to) 100 characters starting at latd_start
    print(s[latd_start:latd_start+100])

get_latd(remove_whitespace(fetch_website(generate_city_URL('New_York_City'))))

The test above prints out 40|latm=40.2|latNS=N|longd=73|longm=56.4|longEW=W|coor_pi. Looking at the page in our browser, we see that indeed, this appears to be correct. Hoorah! If it had been wrong, we would have had a bit more work to do. Often, when you're starting off you'll find that you're off by one, so you'll just have to adjust your code to make sure it works. If you're just trying to get something working in a hurry, it can be ok to 'guess-and-check code', where you add a +1 or a -1 without really thinking. However, this is NOT a good idea for developing software in general, as it can lead to really insidious and impossible to find bugs, and anyone reading your code will want to punch you in the eye when they see magic +1s/-1s.

Now we want to find the "|" that putatively follows the latitude. To do this, we'll use find again. Perhaps we do something like:

def get_latd(s):
    #The latitude appears to always be preceded by |latd=
    latd_marker_string = "|latd="

    latd_marker_location = s.find(latd_marker_string)

    #latd_start is the location of the first digit of the latitude in degrees
    latd_start = latd_marker_location + len(latd_marker_string)

    latd_end_marker_string = '|'
    latd_end = s.find(latd_end_marker_string)

    #print the text between latd_start and latd_end
    print(s[latd_start:latd_end])

get_latd(remove_whitespace(fetch_website(generate_city_URL('New_York_City'))))

When we run the code above, we see nothing get printed! Gaaah! What has happened! We add a print statement to try and diagnose the problem:

def get_latd(s):
    #The latitude appears to always be preceded by |latd=
    latd_marker_string = "|latd="

    latd_marker_location = s.find(latd_marker_string)

    #latd_start is the location of the first digit of the latitude in degrees
    latd_start = latd_marker_location + len(latd_marker_string)

    latd_end_marker_string = '|'
    latd_end = s.find(latd_end_marker_string)

    #temporary debugging aid: show both slice boundaries before slicing
    print("latd_start is: %d, and latd_end is: %d" % (latd_start, latd_end))    
    
    print(s[latd_start:latd_end])

get_latd(remove_whitespace(fetch_website(generate_city_URL('New_York_City'))))

This gives us latd_start is: 2082, and latd_end is: 26. Looking back at the page in our browser, we see the problem, there's a | way early in the file! We forgot to tell find to only start from latd.

We modify our find code to look for "|" only starting from latd_start.

def get_latd(s):
    #The latitude appears to always be preceded by |latd=
    latd_marker_string = "|latd="

    latd_marker_location = s.find(latd_marker_string)

    #latd_start is the location of the first digit of the latitude in degrees
    latd_start = latd_marker_location + len(latd_marker_string)

    latd_end_marker_string = '|'
    #the second argument makes find start searching at latd_start
    latd_end = s.find(latd_end_marker_string, latd_start)

    #temporary debugging aid: show both slice boundaries before slicing
    print("latd_start is: %d, and latd_end is: %d" % (latd_start, latd_end))    
    
    print(s[latd_start:latd_end])

get_latd(remove_whitespace(fetch_website(generate_city_URL('New_York_City'))))

This time we get latd_start is: 2082, and latd_end is: 2084, and the second print statement gives us 40.

This means our code seems to work, so we finish off our code as follows:

def get_latd(s):
    #Returns the latitude in degrees found in s as an integer, or None.
    #s is expected to have had all of its whitespace removed already.
    #The latitude is the text between the first "|latd=" and the next "|".
    #None is returned if the marker is missing or the text is not an integer.

    #The latitude appears to always be preceded by |latd=
    latd_marker_string = "|latd="

    latd_marker_location = s.find(latd_marker_string)

    #find returns -1 when the marker is absent; carrying on regardless would
    #parse whatever unrelated text happens to sit near the start of s
    if latd_marker_location == -1:
        return None

    #latd_start is the location of the first digit of the latitude in degrees
    latd_start = latd_marker_location + len(latd_marker_string)

    latd_end_marker_string = '|'
    latd_end = s.find(latd_end_marker_string, latd_start)

    #No "|" after the marker: read to the end of s. Using the -1 returned by
    #find as a slice end would silently drop the last character.
    if latd_end == -1:
        latd_end = len(s)

    try:
        return int(s[latd_start:latd_end])
    except ValueError:
        #the text between the markers is empty or not an integer
        return None

Above, I've set up get_latd to return an integer. Good software design practice is to decide this before you even start writing your function, but for this case study, I was a little sloppy and made this decision later. Likewise, my error behavior (returning None if something goes wrong) was made in a post hoc manner in the example above, but good software design would state this up front before the function development begins.

One very important point is that we have not guaranteed that get_latd will work correctly. In fact, this is actually an impossible task. In this example, we don't know that all wikipedia pages will obey the pattern that we've implicitly captured in our code. And in general, fully automated verification of arbitrary programs is provably impossible -- indeed, this is the root of some very deep and interesting theory in its own right.

In reality, we'd fill out our module so that we end up getting the entire latitude and longitude instead of just the degrees part of the latitude. If you're interested, I leave this up to you as an exercise.

The module we've built so far is a bit unwieldy for the end user. Suppose someone wants to use our module to calculate the great circle and tunnel distances between two cities. Along the way, they'd have to make awkward function calls like get_latd(remove_whitespace(fetch_website(generate_city_URL('New_York_City')))). The get_latd, remove_whitespace, fetch_website, and generate_city_URL functions are probably not things that such a programmer would care about.

In general, your modules will have functions that are meant for outside use, and others that are for use by functions inside the module itself. These are often called public and private functions, though this distinction is less firm in Python than it is in other languages like C++ or Java. We'll talk about this more in a future class.

For now, let's just define one more function:

def get_latitude(city_name):
    #Returns the latitude in degrees (an integer) for the city called
    #city_name, or None if the page could not be read or no latitude
    #could be found in it.
    city_URL = generate_city_URL(city_name)
    website_data = fetch_website(city_URL)
    #fetch_website returns None on failure; stop here instead of crashing
    #inside remove_whitespace
    if website_data is None:
        return None
    website_data = remove_whitespace(website_data)
    return get_latd(website_data)

Again, a real implementation would also include the latm, lats, and latNS data. Furthermore, an industrial strength module should also have error messages to assist the user when things go wrong.

This process leads us to the final module shown below:

import urllib2
import re

def generate_city_URL(city_name):
    #Builds the Wikipedia URL (action=raw) for the article titled city_name.
    #Each space in the name becomes an underscore; nothing else is changed.
    base_URL = 'http://en.wikipedia.org/w/index.php?action=raw&title='
    return base_URL + '_'.join(city_name.split(' '))
    
    
def fetch_website(URL):
    #Returns everything read from URL as a single string, or None if the
    #page could not be opened or read.
    try:
        u = urllib2.urlopen(URL)
        try:
            return u.read()
        finally:
            #close the connection even if read() fails part-way through
            u.close()
    except Exception:
        #Best effort: any ordinary failure yields None. Unlike a bare
        #"except:", this does not swallow KeyboardInterrupt or SystemExit.
        return None
       
#As per this post on Stack Overflow:
#http://stackoverflow.com/questions/8270092/python-remove-whitespaces-in-string
def remove_whitespace(s):
    #Returns s with every run of whitespace deleted
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', s)
        
def get_latd(s):
    #Returns the latitude in degrees found in s as an integer, or None.
    #s is expected to have had all of its whitespace removed already.
    #The latitude is the text between the first "|latd=" and the next "|".
    #None is returned if the marker is missing or the text is not an integer.

    #The latitude appears to always be preceded by |latd=
    latd_marker_string = "|latd="

    latd_marker_location = s.find(latd_marker_string)

    #find returns -1 when the marker is absent; carrying on regardless would
    #parse whatever unrelated text happens to sit near the start of s
    if latd_marker_location == -1:
        return None

    #latd_start is the location of the first digit of the latitude in degrees
    latd_start = latd_marker_location + len(latd_marker_string)

    latd_end_marker_string = '|'
    latd_end = s.find(latd_end_marker_string, latd_start)

    #No "|" after the marker: read to the end of s. Using the -1 returned by
    #find as a slice end would silently drop the last character.
    if latd_end == -1:
        latd_end = len(s)

    try:
        return int(s[latd_start:latd_end])
    except ValueError:
        #the text between the markers is empty or not an integer
        return None
             
def get_latitude(city_name):
    #Returns the latitude in degrees (an integer) for the city called
    #city_name, or None if the page could not be read or no latitude
    #could be found in it.
    city_URL = generate_city_URL(city_name)
    website_data = fetch_website(city_URL)
    #fetch_website returns None on failure; stop here instead of crashing
    #inside remove_whitespace
    if website_data is None:
        return None
    website_data = remove_whitespace(website_data)
    return get_latd(website_data)
                  
print(get_latitude("New York City"))

#Below are commented out tests used during development
#nyc_url = generate_city_URL('New York City')
#nyc_data = fetch_website(nyc_url)
#print(nyc_data)
#print(remove_whitespace(nyc_data))
#get_latd(remove_whitespace(fetch_website(generate_city_URL('New_York_City'))))

To summarize the design process:

Manually figure out the pattern used by Wikipedia for URLs for cities.
Write code that generates a URL from a city name.
Manually look at the data on the website (for some applications you may need to 'view source' in your browser) and determine the pattern in the text that surrounds the latitude.
Develop a function that fetches data from a URL and returns a string containing all data at that URL.
Develop a function that removes all whitespace from a string, assuming that this will improve robustness.
Develop a function that identifies any characters between "|latd=" and "|" in a string, and returns an integer corresponding to those characters. If the characters in between are not an integer, return None instead. Important note: It's a good idea to define your functions before you write them, particularly if you're working on a project with other people. Having a concise specification of the inputs and outputs of each function is vital to proper software engineering.
Develop a function for use by the end user that doesn't involve having to think about URLs or strings at all. There are conventions in Python for naming public and private functions that we'll discuss in the next couple of weeks.