============================ Python: Programs (Solutions) ============================ #. Solution (one among many):: from pprint import pprint def readFile(filename): f = open(filename) couples = [] for l in f.readlines(): words = l.strip().split() couples.append([words[1], words[2]]) f.close() return couples def extractAlphabet(data): sequences = [] for element in data: sequences += element[1] chars = "".join(sequences) alphabet={} for c in chars: alphabet[c]=1 alphabet=alphabet.keys() alphabet.sort() return alphabet def computeProfile(sequence): profile={} for el in sequence: if not profile.has_key(el): profile[el] = 1 else: profile[el] += 1 return profile def printProfiles(data,alphabet): print "el " + " ".join(alphabet) for row in data: profile = computeProfile(row[1]) print_line = [str(row[0])] for l in alphabet: if profile.has_key(l): print_line.append(str(profile[l])) else: print_line.append("0") print(" ".join(print_line)) def main(): couples = readFile("alignment.txt") alphabet = extractAlphabet(couples) printProfiles(couples,alphabet) return main() #. Solution (one among many):: def readFile(filename): f=open(filename) struct=[] for row in f: if not row.startswith("FT"): continue data = row.split() if data[1] == 'STRAND' or data[1] == 'HELIX' or data[1] == 'TURN': struct.append((data[1],int(data[3])-int(data[2])+1)) f.close() return struct def computeAverageLengths(struct): lengths={} num={} for (ss,length) in struct: if not lengths.has_key(ss): lengths[ss]=length num[ss]=1 else: lengths[ss]+=length num[ss]+=1 return lengths,num def computeAverageConsecutive(struct): durations={} num={} ss=struct[0][0] dur=1 for i in range(1,len(struct)): if struct[i][0] == ss: dur+=1 else: if not durations.has_key(ss): durations[ss] = dur num[ss]=1 else: durations[ss] += dur num[ss]+=1 ss=struct[i][0] dur=1 durations[ss] += dur num[ss]+=1 return durations,num def normalize(values,sizes): normvalues={} for (k,v) in values.items(): normvalues[k] = float(values[k]) / sizes[k] return normvalues def computeSecondaryStructureStats(filename): struct=readFile(filename) print "average lengths " l1, l2 = computeAverageLengths(struct) print normalize(l1,l2) print "average number of consecutive occurrences " o1, o2 = computeAverageConsecutive(struct) print normalize(o1,o2) filename=raw_input("Data file: ") computeSecondaryStructureStats(filename) #. Solution (one among many):: def load_sites(filename): f = open(filename) pre_sites=[] post_sites=[] for row in f: if row[0] == '>': continue pre_sites.append(row[:7]) post_sites.append(row[7:]) f.close() return pre_sites,post_sites def compute_post_site_patterns(sites, maxlen): patterns={} for site in sites: for k in range(2,maxlen): pattern = site[:k] if not patterns.has_key(pattern): patterns[pattern]=1 else: patterns[pattern]+=1 return patterns def compute_pre_site_patterns(sites, maxlen): patterns={} for site in sites: for k in range(2,maxlen+1): pattern = site[-k:] if not patterns.has_key(pattern): patterns[pattern]=1 else: patterns[pattern]+=1 return patterns def filter_by_freq(freqs, threshold): filtered_freqs={} for (k,v) in freqs.items(): if v >= threshold: filtered_freqs[k]=v return filtered_freqs def splice_patterns(filename, threshold): pre_sites,post_sites = load_sites(filename) print filter_by_freq(compute_pre_site_patterns(pre_sites, 7), threshold) print filter_by_freq(compute_post_site_patterns(post_sites, 8), threshold) filename=raw_input("Nome file: ") threshold=int(raw_input("Soglia: ")) splice_patterns(filename, threshold) #. Solution (one among many):: def read_file(filename): f=open(filename) domains={} f.readline() for row in f.readlines(): data=row.split() domain=data[8] if not domains.has_key(domain): domains[domain]=[] domains[domain].append((data[0],int(data[1]),int(data[2]))) return domains def compute_average_length(domains): length4domain={} for (domain,entries) in domains.items(): avg_length=0. for (prot,start,end) in entries: avg_length+=end-start+1 avg_length/=len(entries) length4domain[domain]=avg_length return length4domain def compute_number_of_proteins(domains): prot4domain={} for (domain,entries) in domains.items(): prots={} for (prot,start,end) in entries: prots[prot]=1 prot4domain[domain]=len(prots) return prot4domain def print_statistics(length4domain,prot4domain): print "domain\t\t\tavg_length\tnum_prots" for domain in length4domain.keys(): print "%s\t\t\t%f\t%d" %(domain,length4domain[domain],prot4domain[domain]) def compute_domain_statistics(filename): domains = read_file(filename) lengths=compute_average_length(domains) prots=compute_number_of_proteins(domains) print_statistics(lengths,prots) filename=raw_input("Name of file: ") compute_domain_statistics(filename) #. Solution (one among many):: def readFile(filename): f=open(filename) seqs={} for row in f: if row[0] == '>': data=row[1:].strip().split(':') name,localization=data[0],data[1] if not seqs.has_key(localization): seqs[localization]=[] seqs[localization].append(name) return seqs def countorganism_by_localization(prots): orgbyloc={} for (localization,data) in prots.items(): orgs={} for name in data: org = name.split("_")[1] if not orgs.has_key(org): orgs[org]=1 else: orgs[org]+=1 orgbyloc[localization]=orgs return orgbyloc def print_counts(counts_by_loc): for localization,counts in counts_by_loc.items(): print localization countlist=[(count,org) for (org,count) in counts.items()] countlist.sort() countlist.reverse() print " ".join(["%d:%s" %(count,org) for (count,org) in countlist]) def compute_organismcount_by_localization(filename): prots=readFile(filename) counts_by_loc=countorganism_by_localization(prots) print_counts(counts_by_loc) filename=raw_input("Name of file: ") compute_organismcount_by_localization(filename) #. Solution (one among many):: def loadRNA(filename): f=open(filename) rna={} for row in f: data=row.split() rna[data[0]]=data[1] f.close() return rna def findMatches(seq,subseq): matches=[] start=0 while start != -1: start=seq.find(subseq,start) if start > -1: matches.append(start) start+=1 return matches def findAllMatches(rna,subseq): for (name,seq) in rna.items(): matches=findMatches(seq,subseq) if matches: print name,matches def complementary(pattern): comp={"a" : "t", "t" : "a", "c" : "g", "g" : "c"} return "".join([comp[p] for p in pattern]) def findPerfectMatches(filename,pattern): rna=loadRNA(filename) comppattern=complementary(pattern) findAllMatches(rna,comppattern) findPerfectMatches('utr.txt','acgaatt') #. Midterm exam solution (one among many):: def read_hierarchy(path): fp = open(path) hierarchy = {} for line in fp.readlines()[1:]: prot, dom, res, aa = line.strip().split() if not hierarchy.has_key(prot): hierarchy[prot] = {} if not hierarchy[prot].has_key(dom): hierarchy[prot][dom] = [] hierarchy[prot][dom].append(aa) fp.close() return hierarchy def intersect(list1, list2): intersection = [] for el1 in list1: if el1 in list2: intersection.append(el1) intersection.sort() return intersection def histogram(seq): counts = {} for char in seq: if not counts.has_key(char): counts[char] = 0 counts[char] += 1 return counts def print_shared_domain_stats(data, pi, pj): domainsi = data[pi].keys() domainsj = data[pj].keys() shared = intersect(domainsi, domainsj) print pi, pj, "shared domains:", shared if len(shared) == 0: return for domain in shared: seqi = "".join(data[pi][domain]) seqj = "".join(data[pj][domain]) print domain, "histogram =", histogram(seqi + seqj) def run(): path = raw_input("insert the path to the hierarchy file: ") hierarchy = read_hierarchy(path) if raw_input("all pairs? (Y or N) ") == "N": p1 = raw_input("insert 1st protein ID: ") p2 = raw_input("insert 2nd protein ID: ") print_shared_domain_stats(hierarchy, p1, p2) else: proteins = hierarchy.keys() proteins.sort() for i in range(len(proteins)): for j in range(i + 1, len(proteins)): print_shared_domain_stats(hierarchy, proteins[i], proteins[j]) run()