# Python: Programs (Solutions)

1. Solution (one among many):

from pprint import pprint

def readFile(filename):
    """Read a whitespace-separated file and return a list of
    [second-column, third-column] pairs, one per line.

    NOTE(review): the original fragment lost its `def` line and the
    `for l in f:` loop header during extraction (`l` was used but never
    defined, and `return` sat at top level); both are restored here.
    """
    f = open(filename)
    couples = []
    for l in f:
        words = l.strip().split()
        # column 0 (the row label) is deliberately skipped, as in the original
        couples.append([words[1], words[2]])
    f.close()
    return couples

def extractAlphabet(data):
    """Return the sorted list of distinct characters that occur in the
    sequences (second element) of the entries in `data`.

    Fix: the original did `alphabet = alphabet.keys(); alphabet.sort()`,
    which only works on Python 2 where keys() is a list; sorted() returns
    a fresh sorted list on both Python 2 and 3.
    """
    seen = {}
    for element in data:
        for c in element[1]:
            seen[c] = 1
    return sorted(seen)

def computeProfile(sequence):
    """Count occurrences of each element of `sequence`.

    Returns {element: count}.  dict.get() replaces the Python-2-only
    has_key() test and avoids the double lookup.
    """
    profile = {}
    for el in sequence:
        profile[el] = profile.get(el, 0) + 1
    return profile

def printProfiles(data, alphabet):
    """Print, for every (id, sequence) row in `data`, the count of each
    alphabet letter in the sequence (0 when absent), one row per line.

    Fix: the original mixed a Python-2 `print` statement (header line)
    with print() calls; single-argument print() behaves identically on
    Python 2 and 3.  get() replaces the Python-2-only has_key() branch.
    """
    print("el " + " ".join(alphabet))
    for row in data:
        profile = computeProfile(row[1])
        print_line = [str(row[0])]
        for l in alphabet:
            print_line.append(str(profile.get(l, 0)))
        print(" ".join(print_line))

def main():
    """Entry point: read the couples file and print the letter profiles.

    NOTE(review): the original referenced an undefined global `couples`;
    the readFile() call producing it was evidently lost in extraction
    and is restored here — confirm against the exercise statement.
    """
    filename = raw_input("Data file: ")
    couples = readFile(filename)
    alphabet = extractAlphabet(couples)
    printProfiles(couples, alphabet)
    return

# guard so importing this solution does not trigger the interactive prompt
if __name__ == "__main__":
    main()

2. Solution (one among many):

def readFile(filename):
    """Collect (secondary-structure type, segment length) tuples from
    the FT feature lines of a UniProt-style flat file.

    Only STRAND, HELIX and TURN features are kept; length is computed
    as end - start + 1 from columns 3 and 4.
    """
    handle = open(filename)
    struct = []
    for line in handle:
        if not line.startswith("FT"):
            continue
        fields = line.split()
        if fields[1] in ('STRAND', 'HELIX', 'TURN'):
            struct.append((fields[1], int(fields[3]) - int(fields[2]) + 1))
    handle.close()
    return struct

def computeAverageLengths(struct):
    """Accumulate total residue length and segment count per structure type.

    Returns (lengths, num): two dicts keyed by secondary-structure type,
    ready to be combined by normalize() into averages.  dict.get()
    replaces the Python-2-only has_key() branches.
    """
    lengths = {}
    num = {}
    for (ss, length) in struct:
        lengths[ss] = lengths.get(ss, 0) + length
        num[ss] = num.get(ss, 0) + 1
    return lengths, num

def computeAverageConsecutive(struct):
    """Measure, per structure type, the total length (in entries) of runs
    of consecutive identical types, and how many runs were seen.

    Returns (durations, num) keyed by type.  Fixes two defects of the
    original: the final-run flush did a bare `durations[ss] += dur`,
    which raised KeyError when that type had not appeared in an earlier
    run (e.g. a single-run input), and empty input crashed on struct[0].
    has_key() branches are replaced with dict.get().
    """
    durations = {}
    num = {}
    if not struct:
        return durations, num
    ss = struct[0][0]
    dur = 1
    for i in range(1, len(struct)):
        if struct[i][0] == ss:
            dur += 1
        else:
            # a run just ended: record it and start the next one
            durations[ss] = durations.get(ss, 0) + dur
            num[ss] = num.get(ss, 0) + 1
            ss = struct[i][0]
            dur = 1
    # flush the last run (get() so a first-time type cannot KeyError)
    durations[ss] = durations.get(ss, 0) + dur
    num[ss] = num.get(ss, 0) + 1
    return durations, num

def normalize(values, sizes):
    """Divide every entry of `values` by the matching entry of `sizes`.

    Returns a new dict {key: float(values[key]) / sizes[key]}.
    """
    result = {}
    for key in values:
        result[key] = float(values[key]) / sizes[key]
    return result

def computeSecondaryStructureStats(filename):
    """Read the feature file and print the average segment length and the
    average number of consecutive occurrences per structure type.

    NOTE(review): the original used an undefined global `struct`; the
    readFile() call producing it was evidently lost in extraction and is
    restored here.  Python-2 print statements become single-argument
    print() calls, which behave identically on Python 2 and 3.
    """
    struct = readFile(filename)
    print("average lengths ")
    l1, l2 = computeAverageLengths(struct)
    print(normalize(l1, l2))
    print("average number of consecutive occurrences ")
    o1, o2 = computeAverageConsecutive(struct)
    print(normalize(o1, o2))

# Script entry point: prompt for the data file and print the statistics.
filename=raw_input("Data file: ")
computeSecondaryStructureStats(filename)

3. Solution (one among many):

def load_sites(filename):
    """Split every non-header line of a FASTA-like file at position 7.

    Returns (pre_sites, post_sites): two parallel lists holding the first
    seven characters and the remainder (trailing newline included) of
    each sequence line; '>' header lines are skipped.
    """
    before = []
    after = []
    handle = open(filename)
    for line in handle:
        if line.startswith('>'):
            continue
        before.append(line[:7])
        after.append(line[7:])
    handle.close()
    return before, after

def compute_post_site_patterns(sites, maxlen):
    """Count prefixes of length 2 .. maxlen-1 over all post-site strings.

    Returns {pattern: occurrence count}.  NOTE(review): the upper bound
    is exclusive here (range(2, maxlen)) while the pre-site twin uses
    maxlen+1; the call sites compensate by passing 8 vs 7, so effective
    lengths match — confirm before unifying the two.  dict.get()
    replaces the Python-2-only has_key() branch.
    """
    patterns = {}
    for site in sites:
        for k in range(2, maxlen):
            pattern = site[:k]
            patterns[pattern] = patterns.get(pattern, 0) + 1
    return patterns

def compute_pre_site_patterns(sites, maxlen):
    """Count suffixes of length 2 .. maxlen over all pre-site strings.

    Returns {pattern: occurrence count}.  dict.get() replaces the
    Python-2-only has_key() branch.
    """
    patterns = {}
    for site in sites:
        for k in range(2, maxlen + 1):
            pattern = site[-k:]
            patterns[pattern] = patterns.get(pattern, 0) + 1
    return patterns

def filter_by_freq(freqs, threshold):
    """Return a new dict with only the entries of `freqs` whose count
    is at least `threshold`."""
    return dict((key, count) for (key, count) in freqs.items()
                if count >= threshold)

def splice_patterns(filename, threshold):
    """Print the pre- and post-site patterns occurring at least
    `threshold` times in the splice-site file.

    NOTE(review): the original referenced undefined globals `pre_sites`
    and `post_sites`; the load_sites() call producing them was evidently
    lost in extraction and is restored here.  Single-argument print()
    behaves identically on Python 2 and 3.
    """
    pre_sites, post_sites = load_sites(filename)
    print(filter_by_freq(compute_pre_site_patterns(pre_sites, 7), threshold))
    print(filter_by_freq(compute_post_site_patterns(post_sites, 8), threshold))

# Script entry point: prompt for the file name and the frequency threshold
# (Italian prompts: "Nome file" = file name, "Soglia" = threshold).
filename=raw_input("Nome file: ")
threshold=int(raw_input("Soglia: "))

splice_patterns(filename, threshold)

4. Solution (one among many):

def read_file(filename):
    """Group (protein, start, end) tuples by the domain name found in
    column 9 of a whitespace-separated file.

    Returns {domain: [(protein_id, start, end), ...]}.
    NOTE(review): the original fragment lost its `for row in f:` header
    during extraction (`row` was used but never defined) and never
    closed the file; both are restored here.  `in` replaces the
    Python-2-only has_key().
    """
    f = open(filename)
    domains = {}
    for row in f:
        data = row.split()
        domain = data[8]
        if domain not in domains:
            domains[domain] = []
        domains[domain].append((data[0], int(data[1]), int(data[2])))
    f.close()
    return domains

def compute_average_length(domains):
    """Map each domain to the mean length (end - start + 1) of its entries.

    `domains` is {domain: [(protein, start, end), ...]}; returns
    {domain: average_length} with float averages.
    """
    length4domain = {}
    for domain, entries in domains.items():
        total = sum(end - start + 1 for (prot, start, end) in entries)
        length4domain[domain] = float(total) / len(entries)
    return length4domain

def compute_number_of_proteins(domains):
    """Map each domain to the number of distinct proteins it occurs in.

    `domains` is {domain: [(protein, start, end), ...]}; proteins
    appearing in several entries of the same domain count once.
    """
    prot4domain = {}
    for domain, entries in domains.items():
        prot4domain[domain] = len(set(entry[0] for entry in entries))
    return prot4domain

def print_statistics(length4domain, prot4domain):
    """Print one tab-separated row per domain: name, average length and
    number of distinct proteins, preceded by a header line.

    Fix: Python-2 print statements become single-argument print() calls,
    which behave identically on Python 2 and 3.
    """
    print("domain\t\t\tavg_length\tnum_prots")
    for domain in length4domain.keys():
        print("%s\t\t\t%f\t%d" % (domain, length4domain[domain], prot4domain[domain]))

def compute_domain_statistics(filename):
    """Read the domain file and print the per-domain statistics.

    NOTE(review): the original referenced an undefined global `domains`;
    the read_file() call producing it was evidently lost in extraction
    and is restored here.
    """
    domains = read_file(filename)
    lengths = compute_average_length(domains)
    prots = compute_number_of_proteins(domains)
    print_statistics(lengths, prots)

# Script entry point: prompt for the data file and print domain statistics.
filename=raw_input("Name of file: ")
compute_domain_statistics(filename)

5. Solution (one among many):

def readFile(filename):
    """Group protein names by subcellular localization.

    Header lines look like ">NAME:LOCALIZATION"; all other lines
    (sequence data) are ignored.  Returns {localization: [name, ...]}.
    Fixes: the original leaked the file handle (no close) and used the
    Python-2-only has_key(); setdefault() replaces that branch.
    """
    f = open(filename)
    seqs = {}
    for row in f:
        if row[0] == '>':
            data = row[1:].strip().split(':')
            name, localization = data[0], data[1]
            seqs.setdefault(localization, []).append(name)
    f.close()
    return seqs

def countorganism_by_localization(prots):
    """For each localization, count how many proteins come from each organism.

    The organism is the token after '_' in the protein name (e.g. HUMAN
    in ABC_HUMAN).  Returns {localization: {organism: count}}.
    dict.get() replaces the Python-2-only has_key() branch.
    """
    orgbyloc = {}
    for (localization, data) in prots.items():
        orgs = {}
        for name in data:
            org = name.split("_")[1]
            orgs[org] = orgs.get(org, 0) + 1
        orgbyloc[localization] = orgs
    return orgbyloc

def print_counts(counts_by_loc):
    """Print each localization followed by its per-organism counts in
    decreasing order of count (formatted "count:organism").

    Fix: Python-2 print statements become single-argument print() calls,
    which behave identically on Python 2 and 3.
    """
    for localization, counts in counts_by_loc.items():
        print(localization)
        # sort (count, org) pairs ascending, then reverse for decreasing counts
        countlist = [(count, org) for (org, count) in counts.items()]
        countlist.sort()
        countlist.reverse()
        print(" ".join(["%d:%s" % (count, org) for (count, org) in countlist]))

def compute_organismcount_by_localization(filename):
    """Read the sequence file and print organism counts per localization.

    NOTE(review): the original referenced an undefined global `prots`;
    the readFile() call producing it was evidently lost in extraction
    and is restored here.
    """
    prots = readFile(filename)
    counts_by_loc = countorganism_by_localization(prots)
    print_counts(counts_by_loc)

# Script entry point: prompt for the data file and print organism counts.
filename=raw_input("Name of file: ")
compute_organismcount_by_localization(filename)

6. Solution (one among many):

def loadRNA(filename):
    """Read "name sequence" rows into a {name: sequence} dictionary."""
    handle = open(filename)
    rna = {}
    for line in handle:
        fields = line.split()
        rna[fields[0]] = fields[1]
    handle.close()
    return rna

def findMatches(seq, subseq):
    """Return the start indices of every occurrence of `subseq` in `seq`.

    Overlapping occurrences are counted: the search resumes one position
    after each hit.
    """
    matches = []
    pos = seq.find(subseq)
    while pos != -1:
        matches.append(pos)
        pos = seq.find(subseq, pos + 1)
    return matches

def findAllMatches(rna, subseq):
    """Print the name and match positions of every sequence in `rna`
    that contains `subseq` at least once.

    Fix: the Python-2 statement `print name,matches` becomes a single
    %-formatted print() call, producing the same "name [pos, ...]"
    output on both Python 2 and 3.
    """
    for (name, seq) in rna.items():
        matches = findMatches(seq, subseq)
        if matches:
            print("%s %s" % (name, matches))

def complementary(pattern):
    """Return the base-by-base complement of a lowercase DNA pattern
    (a<->t, c<->g); order is preserved (not reversed)."""
    comp = {"a": "t", "t": "a", "c": "g", "g": "c"}
    result = ""
    for base in pattern:
        result = result + comp[base]
    return result

def findPerfectMatches(filename, pattern):
    """Print, for each sequence in `filename`, the positions where the
    complement of `pattern` occurs.

    NOTE(review): the original referenced an undefined global `rna`; the
    loadRNA() call producing it was evidently lost in extraction and is
    restored here.
    """
    rna = loadRNA(filename)
    comppattern = complementary(pattern)
    findAllMatches(rna, comppattern)

# guard so importing this solution does not try to read 'utr.txt'
if __name__ == "__main__":
    findPerfectMatches('utr.txt', 'acgaatt')

7. Midterm exam solution (one among many):

    def read_hierarchy(path):
fp = open(path)
hierarchy = {}
prot, dom, res, aa = line.strip().split()
if not hierarchy.has_key(prot):
hierarchy[prot] = {}
if not hierarchy[prot].has_key(dom):
hierarchy[prot][dom] = []
hierarchy[prot][dom].append(aa)
fp.close()
return hierarchy

def intersect(list1, list2):
    """Return the sorted elements of `list1` that also occur in `list2`.

    Duplicates in `list1` are kept (each occurrence is tested separately).
    """
    return sorted(el for el in list1 if el in list2)

def histogram(seq):
    """Count occurrences of each element of `seq` -> {element: count}.

    dict.get() replaces the Python-2-only has_key() test.
    """
    counts = {}
    for char in seq:
        counts[char] = counts.get(char, 0) + 1
    return counts

def print_shared_domain_stats(data, pi, pj):
    """Print the domains shared by proteins `pi` and `pj` and, for each
    shared domain, the amino-acid histogram of their pooled residues.

    Fix: Python-2 multi-argument print statements become %-formatted
    print() calls, producing identical output on Python 2 and 3.
    """
    domainsi = data[pi].keys()
    domainsj = data[pj].keys()

    shared = intersect(domainsi, domainsj)
    print("%s %s shared domains: %s" % (pi, pj, shared))
    if len(shared) == 0:
        return

    for domain in shared:
        seqi = "".join(data[pi][domain])
        seqj = "".join(data[pj][domain])
        print("%s histogram = %s" % (domain, histogram(seqi + seqj)))

def run():
    """Interactive entry point: load the hierarchy file, then report
    shared-domain statistics for one chosen protein pair or all pairs.

    NOTE(review): the original referenced an undefined global
    `hierarchy`; the read_hierarchy() call producing it was evidently
    lost in extraction and is restored here.
    """
    path = raw_input("insert the path to the hierarchy file: ")
    hierarchy = read_hierarchy(path)

    if raw_input("all pairs? (Y or N) ") == "N":
        p1 = raw_input("insert 1st protein ID: ")
        p2 = raw_input("insert 2nd protein ID: ")
        print_shared_domain_stats(hierarchy, p1, p2)
    else:
        # sorted() instead of keys().sort(): also works where keys() is a view
        proteins = sorted(hierarchy.keys())
        # every unordered pair exactly once
        for i in range(len(proteins)):
            for j in range(i + 1, len(proteins)):
                print_shared_domain_stats(hierarchy, proteins[i], proteins[j])

# guard so importing this solution does not trigger the interactive prompt
if __name__ == "__main__":
    run()