# Pandas: Solutions

1. Solutions:

import pandas as pd
import matplotlib.pyplot as plt

1. Solution:

# number of rows and columns
print df_iris.shape

# only rows
print len(df_iris.index)

# only columns
print len(df_iris.columns)

2. Solution:

# average petal length
print df_iris.PetalLength.mean()

3. Solution:

# average of all numeric columns
print df_iris.mean()

4. Solution:

# rows corresponding to petal length outliers
print df_iris[df_iris.PetalLength > 1.5*df_iris.PetalLength.mean()]

5. Solution:

# group-wise standard deviation
grouped = df_iris.groupby(df_iris.Name)
iris_std_by_name = grouped.aggregate(pd.DataFrame.std)
print iris_std_by_name

# or in a single row
print df_iris.groupby(df_iris.Name).aggregate(pd.DataFrame.std)

6. Solution:

# grouped outliers
for name, dataframe in df_iris.groupby(df_iris.Name):
print dataframe[dataframe.PetalLength > 1.5*df_iris.PetalLength.mean()]

7. Solution:

# WARNING: there are *no* group-wise outliers!

# group-wise outliers
for name, dataframe in df_iris.groupby(df_iris.Name):
print dataframe[dataframe.PetalLength > 1.5*dataframe.PetalLength.mean()]

# using groupby-aggregate-merge
grouped = df_iris.groupby(df_iris.Name, as_index=False)
petlen_mean_by_name = grouped.aggregate(pd.DataFrame.mean)
merged = pd.merge(iris, petlen_mean_by_name, on="Name")
# use print merged.columns to take a peek at the column names; the
# column_x columns come from the left table (i.e. the original iris
# DataFrame), the column_y ones come # from the right table (i.e. the
# group-wise means)
print merged[merged.PetalLength_x > 1.5 * merged.PetalLength_y]

2. Solutions:

import pandas as pd
import matplotlib.pyplot as plt

1. Solution:

# number of genes (there is exactly one per row)
# NOTE(review): this prints the full (rows, columns) tuple; the gene
# count is the first entry
print df_gene.shape

2. Solution:

# minimum, maximum, average and median number of isoforms per gene
print df_gene.transcript_count.min(), \
df_gene.transcript_count.max(), \
df_gene.transcript_count.mean(), \
df_gene.transcript_count.median()

3. Solution:

# histogram of the number of known isoforms per gene, 100 bins
df_gene["transcript_count"].plot(kind="hist", bins=100)
plt.show()

4. Solution:

# computes, for each gene_biotype, the number of associated genes
grouped = df_gene.groupby(df_gene.gene_biotype)
grouped_number_by_biotype = grouped.aggregate(pd.DataFrame.count)
print grouped_number_by_biotype

5. Solution:

rows = []
for biotype, subdf in df_gene.groupby(df_gene.gene_biotype):
rows.append((len(subdf), biotype))

# alternative 1
rows.sort()
rows.reverse()
print rows

print sorted(rows, reverse=True)

6. Solution:

print len(df_gene.chromosome.unique())

# or equivalently (the result is a numpy array)
print df_gene.shape

7. Solution: almost identical to the solution of point 5.

8. Solution:

for chromosome, subdf in df_gene.groupby(df_gene.chromosome):
num_plus = float(len(subdf[subdf.strand == "+"]))
perc_plus = 100 * num_plus / len(subdf)
print chromosome, perc_plus

9. Solution:

# average transcript count per gene_biotype, keeping only the two
# columns of interest
grouped = df_gene.groupby(df_gene.gene_biotype, as_index=False)
grouped.aggregate(pd.DataFrame.mean)[['gene_biotype', 'transcript_count']]

3. Solutions:

import pandas as pd
import matplotlib.pyplot as plt

1. Solution:

# (rows, columns) of the transcript table
print df_tt.shape

2. Solution:

# minimum, maximum, average and median length of human transcripts
print df_tt.transcript_length.min(), \
df_tt.transcript_length.max(), \
df_tt.transcript_length.mean(), \
df_tt.transcript_length.median()

3. Solution:

# minimum, maximum, average and median length of the CDS of human transcripts (excluding values equal to 0)
print (df_tt.cds_length[df_tt.cds_length>0]).min(), \
(df_tt.cds_length[df_tt.cds_length>0]).max(), \
(df_tt.cds_length[df_tt.cds_length>0]).mean(), \
(df_tt.cds_length[df_tt.cds_length>0]).median()

4. Solution:

# percentage of human transcripts with a CDS length that is a multiple
# of 3 (excluding values equal to 0)
nonzero_cds = df_tt.cds_length[df_tt.cds_length > 0]
# BUG FIX: .shape is a (rows,) tuple, which float() cannot convert;
# use .shape[0] to get the row count. Also apply the %3 mask to the
# already-filtered series, not to the full column
cds_number = nonzero_cds.shape[0]
cds_multiple_number = nonzero_cds[nonzero_cds % 3 == 0].shape[0]
print (float(cds_multiple_number) / float(cds_number)) * 100
# Is the percentage equal to 100%?
# How can you explain coding sequences whose length is not a multiple
# of 3? (remember the rules of the genetic code)

5. Solution:

# minimum, maximum, average and median length of the UTRs of human transcripts (excluding values equal to 0)
# for 5' UTRs
filter_df = df_tt[df_tt.utr5_length>0]
print filter_df.utr5_length.min(), \
filter_df.utr5_length.max(), \
filter_df.utr5_length.mean(), \
filter_df.utr5_length.median()

# for 3' UTRs
filter_df = df_tt[df_tt.utr3_length>0]
print filter_df.utr3_length.min(), \
filter_df.utr3_length.max(), \
filter_df.utr3_length.mean(), \
filter_df.utr3_length.median()

# for both 5' and 5' UTRs (calculating the sum of lengths)
filter_df = df_tt[df_tt.utr5_length>0][df_tt.utr3_length>0]
sum_of_lengths = filter_df.utr5_length + filter_df.utr3_length
print sum_of_lengths.min(), \
sum_of_lengths.max(), \
sum_of_lengths.mean(), \
sum_of_lengths.median()

6. Solution:

# computes, for each transcript_biotype, the number of associated
#transcripts (a histogram), and prints the transcript_biotype with the
#number of associated transcripts in decreasing order
grouped = df_tt.groupby(df_tt.transcript_biotype)
grouped_number_by_biotype = grouped.size() # the result is a series
grouped_number_by_biotype.sort(ascending=0) # sort the series
print grouped_number_by_biotype

7. Solution:

# computes, for each transcript_biotype, the average transcript length, and prints the results in increasing order
grouped = df_tt.groupby(df_tt.transcript_biotype)
grouped_number_by_biotype = grouped.aggregate(pd.DataFrame.mean)
sorted_number_by_biotype = grouped_number_by_biotype.sort("transcript_length")
print sorted_number_by_biotype.transcript_length

8. Solution:

# computes, for protein_coding transcripts, the average length of the 5'UTR, CDS and 3' UTR
filter_df = df_tt[df_tt.transcript_biotype=="protein_coding"]
print filter_df.utr5_length.mean(), \
filter_df.cds_length.mean(), \
filter_df.utr3_length.mean()

9. Solution:

# computes, for each transcript_biotype and considering only canonical transcripts, the average number of exons
filter_df = df_tt[df_tt.canonical_flag=="T"]
grouped = filter_df.groupby(filter_df.transcript_biotype)
grouped_number_by_biotype = grouped.aggregate(pd.DataFrame.mean)
print grouped_number_by_biotype.exon_count

4. Solutions:

import pandas as pd
import matplotlib.pyplot as plt

1. Solution:

df_length = df_exon.exon_chrom_start - df_exon.exon_chrom_end + 1
print df_length.min(), df_length.max(), df_length.mean(), df_length.median()

2. Solution:

name = raw_input("write a trascript name :")

filtered = df_exon[df_exon.transcript_name == name]
filtered_length = filtered.exon_chrom_start - filtered.exon_chrom_end + 1
merged = df.merge(filtered, filtered_length)
merged.sort(["exon_chrom_start"])

5. Solutions:

# load the tab-separated expression time series into a DataFrame
ts = pd.read_csv("expression_timeseries.txt", sep="\t")

1. Solution:

# number of genes
print ts.shape

# number of time steps (just count how many columns end by 'h')
print len([c for c in ts.columns if c[-1] == "h"])

2. Solution:

# if there are NaNs, then dropna will remove at least one row
print "got nans?", ts.shape != ts.dropna().shape

# how many rows with at least one NaN are there? (just to double check)
print ts.shape - ts.dropna().shape

# fill in the missing values
ts = ts.ffill()


Note

In the following solutions we will assume that ffill() has already been called.

1. Solution: there are actually two expression profiles for the "ZFX" gene:

>>> expr_zfx = ts[ts["Gene Symbol"] == "ZFX"]
>>> expr_zfx
SPOT Gene Symbol     0h   0.5h     3h     6h    12h
0        1         ZFX -0.092  0.126  0.109  0.043  0.074
1739  1740         ZFX -0.487  0.298 -0.058 -0.158 -0.155


Let’s plot the first; the other can be dealt with similarly:

# first is a Series
# BUG FIX: .iloc needs an index; take the first matching row
first = expr_zfx.iloc[0]
# skip the first two entries (SPOT and Gene Symbol), keep the measurements
numbers = first[2:]
numbers.plot(kind="line")
plt.savefig("output.png")

2. Solution: let’s take the mean over the time steps:

# per-gene mean over the measurement columns (third onwards); axis=1
# averages across columns, i.e. over the time steps of each row
means = ts[ts.columns[2:]].mean(axis=1)


Here the first part extracts only those columns that encode expression measurements (from the third onwards), while axis=1 specifies that the average should be taken by averaging over columns, rather than over rows as we are used to. The latter case corresponds to axis=0, and is the default.

Now:

# per-gene variance over the measurement columns (axis=1 = across columns)
variances = ts[ts.columns[2:]].var(axis=1)


does the same thing for the variance.

Let’s plot the mean and the variance:

# plot the per-gene means and save the figure to file
means.plot(kind="line")
plt.savefig("means.png")

# clear the screen between plots
plt.clf()

# same thing for variance here


Warning

The above exercise is slightly bugged. In order to be solved “the Pandas way”, it requires more advanced material that is not provided by the notes. I gave a fully Pandas-approved solution.

1. Solution:

# we know that the only columns we care about are from the third
# onwards -- for simplicity
for column in ts.columns[2:]:
index_of_max = ts[column].argmax()
print column, ts.iloc[index_of_max]


Unfortunately most of the gene names are NaN; the solution is correct nevertheless.

2. Solution: same as the solution of point 3 below, except that the inner loop keeps track of the index of the maximally correlated gene.

3. Very elementary solution, it will take a long time to complete:

ts = pd.read_csv("expression_timeseries.txt", sep="\t")
# BUG FIX: shape is a (rows, columns) tuple and range() needs an int;
# take the row count
num_rows = ts.shape[0]  # set this to 100 or so to make it faster

# naive O(n^2) pairwise Pearson correlation between expression profiles
# FIX: the loop bodies must be indented
corr = []
for i in range(num_rows):
    row_i = ts.iloc[i][2:].astype(float)
    corr_i = []
    for j in range(num_rows):
        row_j = ts.iloc[j][2:].astype(float)
        corr_i.append(row_i.corr(row_j))
    corr.append(corr_i)

plt.imshow(pd.DataFrame(corr).values, cmap="gray", interpolation="nearest")
plt.savefig("corr.png")