=================
Pandas: Solutions
=================

#. Solutions::

      import pandas as pd
      import matplotlib.pyplot as plt

      df_iris = pd.read_csv("iris.csv")

#. Solution::

      # number of rows and columns
      print df_iris.shape

      # only rows
      print len(df_iris.index)

      # only columns
      print len(df_iris.columns)

#. Solution::

      # average petal length
      print df_iris.PetalLength.mean()

#. Solution::

      # average of all numeric columns
      print df_iris.mean()

#. Solution::

      # rows corresponding to petal length outliers
      print df_iris[df_iris.PetalLength > 1.5 * df_iris.PetalLength.mean()]

#. Solution::

      # group-wise standard deviation
      grouped = df_iris.groupby(df_iris.Name)
      iris_std_by_name = grouped.aggregate(pd.DataFrame.std)
      print iris_std_by_name

      # or in a single line
      print df_iris.groupby(df_iris.Name).aggregate(pd.DataFrame.std)

#. Solution::

      # grouped outliers (with respect to the *global* mean)
      for name, dataframe in df_iris.groupby(df_iris.Name):
          print dataframe[dataframe.PetalLength > 1.5 * df_iris.PetalLength.mean()]

#. Solution::

      # WARNING: there are *no* group-wise outliers!

      # group-wise outliers
      for name, dataframe in df_iris.groupby(df_iris.Name):
          print dataframe[dataframe.PetalLength > 1.5 * dataframe.PetalLength.mean()]

      # using groupby-aggregate-merge
      grouped = df_iris.groupby(df_iris.Name, as_index=False)
      petlen_mean_by_name = grouped.aggregate(pd.DataFrame.mean)
      merged = pd.merge(df_iris, petlen_mean_by_name, on="Name")

      # use `print merged.columns` to take a peek at the column names: the
      # column_x columns come from the left table (i.e. the original iris
      # DataFrame), the column_y ones come from the right table (i.e. the
      # group-wise means)
      print merged[merged.PetalLength_x > 1.5 * merged.PetalLength_y]

#. Solutions::

      import pandas as pd
      import matplotlib.pyplot as plt

      df_gene = pd.read_csv("gene_table.txt")

#. Solution::

      # number of genes (there is exactly one per row)
      print df_gene.shape[0]
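As an aside, the groupby-aggregate-merge recipe in the last iris solution can be written more compactly with ``transform``, which broadcasts each group's aggregate back onto the original rows. A minimal sketch on a toy DataFrame (the column names mimic the iris table, the values are invented, and modern Python 3 / pandas syntax is used):

```python
import pandas as pd

# Toy stand-in for the iris table (made-up values, for illustration only)
df = pd.DataFrame({
    "Name": ["setosa", "setosa", "virginica", "virginica", "virginica"],
    "PetalLength": [1.0, 2.0, 4.0, 5.0, 12.0],
})

# transform() returns a Series aligned with the original rows, each entry
# holding its group's mean -- so no merge is needed for the comparison
group_mean = df.groupby("Name")["PetalLength"].transform("mean")
outliers = df[df["PetalLength"] > 1.5 * group_mean]
print(outliers)
```

This avoids the ``_x``/``_y`` column suffixes entirely, at the price of not seeing the group means as explicit columns.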
#. Solution::

      # minimum, maximum, average and median number of isoforms per gene
      print df_gene.transcript_count.min(), \
          df_gene.transcript_count.max(), \
          df_gene.transcript_count.mean(), \
          df_gene.transcript_count.median()

#. Solution::

      # plot a histogram of the number of known isoforms per gene
      df_gene.transcript_count.plot(kind="hist", bins=100)
      plt.show()

#. Solution::

      # computes, for each gene_biotype, the number of associated genes
      grouped = df_gene.groupby(df_gene.gene_biotype)
      grouped_number_by_biotype = grouped.aggregate(pd.DataFrame.count)
      print grouped_number_by_biotype

#. Solution::

      rows = []
      for biotype, subdf in df_gene.groupby(df_gene.gene_biotype):
          rows.append((len(subdf), biotype))

      # alternative 1
      rows.sort()
      rows.reverse()
      print rows

      # alternative 2, more advanced
      print sorted(rows, reverse=True)

#. Solution::

      print len(df_gene.chromosome.unique())

      # or equivalently (unique() returns a numpy array)
      print df_gene.chromosome.unique().shape[0]

#. Solution: almost identical to the solution of point 5.

#. Solution::

      for chromosome, subdf in df_gene.groupby(df_gene.chromosome):
          num_plus = float(len(subdf[subdf.strand == "+"]))
          perc_plus = 100 * num_plus / len(subdf)
          print chromosome, perc_plus

#. Solution::

      df_gene.groupby(df_gene.gene_biotype, as_index=False) \
          .aggregate(pd.DataFrame.mean)[["gene_biotype", "transcript_count"]]

#. Solutions::

      import pandas as pd
      import matplotlib.pyplot as plt

      df_tt = pd.read_csv("transcript_table.txt")

#. Solution::

      print df_tt.shape[0]

#. Solution::

      # minimum, maximum, average and median length of human transcripts
      print df_tt.transcript_length.min(), \
          df_tt.transcript_length.max(), \
          df_tt.transcript_length.mean(), \
          df_tt.transcript_length.median()
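The "count genes per biotype and sort in decreasing order" pattern above can also be done in a single call with ``value_counts``, which counts occurrences and returns them already sorted. A minimal sketch on a toy DataFrame (the biotypes are hypothetical, and modern Python 3 / pandas syntax is used):

```python
import pandas as pd

# Toy gene table (hypothetical biotypes, for illustration only)
df = pd.DataFrame({
    "gene_biotype": ["protein_coding", "lincRNA", "protein_coding",
                     "miRNA", "protein_coding"],
})

# value_counts() counts each distinct value and sorts the result in
# decreasing order -- one call instead of groupby + count + sort
counts = df["gene_biotype"].value_counts()
print(counts)
```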
#. Solution::

      # minimum, maximum, average and median length of the CDS of human
      # transcripts (excluding values equal to 0)
      coding = df_tt.cds_length[df_tt.cds_length > 0]
      print coding.min(), \
          coding.max(), \
          coding.mean(), \
          coding.median()

#. Solution::

      # computes the percentage of human transcripts with a CDS length
      # that is a multiple of 3 (excluding values equal to 0)
      cds_number = df_tt.cds_length[df_tt.cds_length > 0].shape[0]
      cds_multiple_number = \
          df_tt.cds_length[(df_tt.cds_length > 0) & (df_tt.cds_length % 3 == 0)].shape[0]
      print (float(cds_multiple_number) / float(cds_number)) * 100

      # Is the percentage equal to 100%?  How can you explain coding
      # sequences whose length is not a multiple of 3?  (Remember the
      # rules of the genetic code.)

#. Solution::

      # minimum, maximum, average and median length of the UTRs of human
      # transcripts (excluding values equal to 0)

      # for 5' UTRs
      filter_df = df_tt[df_tt.utr5_length > 0]
      print filter_df.utr5_length.min(), \
          filter_df.utr5_length.max(), \
          filter_df.utr5_length.mean(), \
          filter_df.utr5_length.median()

      # for 3' UTRs
      filter_df = df_tt[df_tt.utr3_length > 0]
      print filter_df.utr3_length.min(), \
          filter_df.utr3_length.max(), \
          filter_df.utr3_length.mean(), \
          filter_df.utr3_length.median()

      # for both 5' and 3' UTRs (calculating the sum of the two lengths)
      filter_df = df_tt[(df_tt.utr5_length > 0) & (df_tt.utr3_length > 0)]
      sum_of_lengths = filter_df.utr5_length + filter_df.utr3_length
      print sum_of_lengths.min(), \
          sum_of_lengths.max(), \
          sum_of_lengths.mean(), \
          sum_of_lengths.median()
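Combining the two conditions of the multiple-of-3 exercise into a single boolean mask with ``&`` keeps the filters explicitly aligned and avoids chained indexing. A minimal, self-contained sketch on a toy Series of CDS lengths (the values are invented, and modern Python 3 / pandas syntax is used):

```python
import pandas as pd

# Toy CDS lengths (hypothetical); zeros mark non-coding transcripts
cds = pd.Series([0, 300, 301, 99, 0, 150])

# one mask per condition; & combines them element-wise
nonzero = cds > 0
multiple_of_3 = cds % 3 == 0

# summing a boolean mask counts its True entries
percentage = 100.0 * (nonzero & multiple_of_3).sum() / nonzero.sum()
print(percentage)
```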
#. Solution::

      # computes, for each transcript_biotype, the number of associated
      # transcripts (a histogram), and prints the transcript_biotypes
      # with their number of associated transcripts in decreasing order
      grouped = df_tt.groupby(df_tt.transcript_biotype)
      grouped_number_by_biotype = grouped.size()  # the result is a Series

      # sort the Series in decreasing order
      grouped_number_by_biotype = grouped_number_by_biotype.sort_values(ascending=False)
      print grouped_number_by_biotype

#. Solution::

      # computes, for each transcript_biotype, the average transcript
      # length, and prints the results in increasing order
      grouped = df_tt.groupby(df_tt.transcript_biotype)
      grouped_mean_by_biotype = grouped.aggregate(pd.DataFrame.mean)
      sorted_mean_by_biotype = grouped_mean_by_biotype.sort_values("transcript_length")
      print sorted_mean_by_biotype.transcript_length

#. Solution::

      # computes, for protein_coding transcripts, the average length of
      # the 5' UTR, CDS and 3' UTR
      filter_df = df_tt[df_tt.transcript_biotype == "protein_coding"]
      print filter_df.utr5_length.mean(), \
          filter_df.cds_length.mean(), \
          filter_df.utr3_length.mean()

#. Solution::

      # computes, for each transcript_biotype and considering only
      # canonical transcripts, the average number of exons
      filter_df = df_tt[df_tt.canonical_flag == "T"]
      grouped = filter_df.groupby(filter_df.transcript_biotype)
      grouped_mean_by_biotype = grouped.aggregate(pd.DataFrame.mean)
      print grouped_mean_by_biotype.exon_count

#. Solutions::

      import pandas as pd
      import matplotlib.pyplot as plt

      df_exon = pd.read_csv("exon_table.txt")

#. Solution::

      # exon coordinates are inclusive, so the length is end - start + 1
      df_length = df_exon.exon_chrom_end - df_exon.exon_chrom_start + 1
      print df_length.min(), df_length.max(), df_length.mean(), df_length.median()

#. Solution::

      name = raw_input("write a transcript name: ")
      filtered = df_exon[df_exon.transcript_name == name].copy()

      # add the exon length as a new column, then sort by genomic position
      filtered["exon_length"] = filtered.exon_chrom_end - filtered.exon_chrom_start + 1
      print filtered.sort_values("exon_chrom_start")

#. Solutions::

      ts = pd.read_csv("expression_timeseries.txt", sep="\t")
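The exon-length arithmetic and per-transcript sorting above can be checked on a tiny table. A minimal sketch (the coordinates are invented, 1-based and inclusive as in the exon exercise, and modern Python 3 / pandas syntax is used):

```python
import pandas as pd

# Toy exon table (hypothetical 1-based inclusive coordinates)
df = pd.DataFrame({
    "transcript_name":  ["T1", "T1", "T2"],
    "exon_chrom_start": [400, 100, 50],
    "exon_chrom_end":   [449, 199, 149],
})

# inclusive endpoints, hence the + 1
df["exon_length"] = df.exon_chrom_end - df.exon_chrom_start + 1

# exons of one transcript, in genomic order
t1 = df[df.transcript_name == "T1"].sort_values("exon_chrom_start")
print(t1[["exon_chrom_start", "exon_length"]])
```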
#. Solution::

      # number of genes
      print ts.shape[0]

      # number of time steps (just count how many columns end in 'h')
      print len([c for c in ts.columns if c[-1] == "h"])

#. Solution::

      # if there are NaNs, then dropna() will remove at least one row
      print "got nans?", ts.shape[0] != ts.dropna().shape[0]

      # how many rows with at least one NaN are there? (just to double-check)
      print ts.shape[0] - ts.dropna().shape[0]

      # fill in the missing values
      ts = ts.ffill()

   .. note::

      In the following solutions we will assume that ``ffill()`` has
      already been called.

#. Solution: there are actually *two* expression profiles for the
   ``"ZFX"`` gene::

      >>> expr_zfx = ts[ts["Gene Symbol"] == "ZFX"]
      >>> expr_zfx
            SPOT Gene Symbol     0h   0.5h     3h     6h    12h
      0        1         ZFX -0.092  0.126  0.109  0.043  0.074
      1739  1740         ZFX -0.487  0.298 -0.058 -0.158 -0.155

   Let's plot the first; the other can be dealt with similarly::

      # first is a Series
      first = expr_zfx.iloc[0]
      numbers = first[2:]
      numbers.plot(kind="line")
      plt.savefig("output.png")

#. Solution: let's take the mean over the time steps::

      means = ts[ts.columns[2:]].mean(axis=1)

   Here the first part extracts only those columns that encode expression
   measurements (from the third onwards), while ``axis=1`` specifies that
   the average should be taken over *columns*, rather than over *rows* as
   we are used to.  The latter case corresponds to ``axis=0``, and is the
   default.  Now::

      variances = ts[ts.columns[2:]].var(axis=1)

   does the same thing for the variance.  Let's plot the mean and the
   variance::

      means.plot(kind="line")
      plt.savefig("means.png")

      # clear the current figure between plots
      plt.clf()

      variances.plot(kind="line")
      plt.savefig("variances.png")

   .. warning::

      The above exercise is slightly bugged: solving it "the Pandas way"
      requires more advanced material than these notes provide.  The
      solution given here is nevertheless fully Pandas-approved.
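The interaction between ``ffill()`` and row-wise aggregation is easy to see on a toy expression matrix. A minimal sketch (the values are invented and modern Python 3 / pandas syntax is used):

```python
import numpy as np
import pandas as pd

# Toy expression matrix: rows are genes, columns are time points
# (hypothetical values; NaN marks a missing measurement)
ts = pd.DataFrame({
    "0h": [1.0, 2.0],
    "3h": [5.0, np.nan],
    "6h": [3.0, 6.0],
})

# ffill() propagates the last valid value *down* each column, so the
# NaN in column "3h" is replaced by the 5.0 above it
filled = ts.ffill()

# axis=1 averages over columns: one mean per gene/row
means = filled.mean(axis=1)
print(means.tolist())
```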
#. Solution::

      # we know that the only columns we care about are from the third
      # onwards -- for simplicity
      for column in ts.columns[2:]:
          index_of_max = ts[column].idxmax()
          print column, ts.iloc[index_of_max]

   Unfortunately most of the gene names are ``NaN``; the solution is
   correct nevertheless.

#. Solution: same as below, except that the inner loop keeps track of the
   index of the maximally correlated gene.

#. Very elementary solution; it will take a **long** time to complete::

      ts = pd.read_csv("expression_timeseries.txt", sep="\t")

      num_rows = ts.shape[0]  # set this to 100 or so to make it faster

      corr = []
      for i in range(num_rows):
          row_i = ts.iloc[i][2:].astype(float)
          corr_i = []
          for j in range(num_rows):
              row_j = ts.iloc[j][2:].astype(float)
              corr_i.append(row_i.corr(row_j))
          corr.append(corr_i)

      plt.imshow(pd.DataFrame(corr).values, cmap="gray", interpolation="nearest")
      plt.savefig("corr.png")

#. Left to the reader.
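For completeness, the quadratic double loop above can be replaced by a single vectorized call: ``DataFrame.corr()`` correlates *columns*, so transposing first yields the gene-by-gene correlation matrix directly. A minimal sketch on a toy matrix (the values are invented and modern Python 3 / pandas syntax is used):

```python
import pandas as pd

# Toy expression matrix: 3 genes x 4 time points (hypothetical values)
ts = pd.DataFrame(
    [[1.0, 2.0, 3.0, 4.0],
     [2.0, 4.0, 6.0, 8.0],
     [4.0, 3.0, 2.0, 1.0]],
    index=["G1", "G2", "G3"],
    columns=["0h", "3h", "6h", "12h"],
)

# transpose so genes become columns, then correlate them pairwise;
# this computes the whole matrix the nested loops build element by element
corr = ts.T.corr()
print(corr)
```

G1 and G2 rise together (correlation 1), while G3 moves in the opposite direction (correlation -1 with G1).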