TPM (transcripts per million) calculation

This tutorial covers transcripts per million calculation with library size normalization using edgeR. This tutorial was constructed for The Laboratory of Cell Systems, Institute for Protein Research, Suita, Osaka, Japan.

prep our library

library(tidyverse)
library(data.table)
library(edgeR)
## Warning: package 'edgeR' was built under R version 3.6.1

let’s read our raw counts data

counts <- read_csv(file = "D:/lab_rmd/tpm/counts_trimmed.csv")

Here is how the data should look like

head(counts) 
## # A tibble: 6 x 10
##   Geneid          Chr               Start                                      End                                       Strand            Length B00_A01 B00_A02 B00_A03 B00_A04
##   <chr>           <chr>             <chr>                                      <chr>                                     <chr>              <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 ENSGALG0000005… 1;1;1;1;1;1;1;1;… 5273;5273;5536;6755;6755;6755;8674;8674;8… 5524;5524;5952;6829;6829;6829;8775;8904;… -;-;-;-;-;-;-;-;…   2018       0       0       0       0
## 2 ENSGALG0000005… 1;1;1;1;1         9441;9681;9768;10182;10256                 10053;9683;10070;14305;14305              +;+;+;+;+           4754       0       0       0       0
## 3 ENSGALG0000004… 1;1;1;1;1         27209;32230;32540;33081;33287              27503;32331;32721;33187;33555             +;+;+;+;+            955     105      73      35      42
## 4 ENSGALG0000005… 1;1;1;1;1         31439;32230;53984;54190;242817             31484;32331;54090;54263;243537            +;+;+;+;+           1050     285     220     124     230
## 5 ENSGALG0000004… 1;1;1;1;1;1;1;1;1 39057;39080;39967;39967;40433;40433;40823… 39867;39867;40073;40073;40924;40614;4092… -;-;-;-;-;-;-;-;-   2161     224     138     134     178
## 6 ENSGALG0000004… 1                 58427                                      58617                                     +                    191       0       0       0       0

Remove Chr, Start, End, Strand. Only keep geneid, length and samples

counts <- counts[,!colnames(counts) %in% c("Chr", "Start", "End", "Strand")]
head(counts)
## # A tibble: 6 x 6
##   Geneid             Length B00_A01 B00_A02 B00_A03 B00_A04
##   <chr>               <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 ENSGALG00000054818   2018       0       0       0       0
## 2 ENSGALG00000053455   4754       0       0       0       0
## 3 ENSGALG00000045540    955     105      73      35      42
## 4 ENSGALG00000051297   1050     285     220     124     230
## 5 ENSGALG00000042023   2161     224     138     134     178
## 6 ENSGALG00000047594    191       0       0       0       0

Divide the sample read count by the length of the gene multiplied by 1000

counts_tpm <- (counts[,3:ncol(counts)]/counts$Length) *1000
#return to tible just because I like tible
counts_tpm <- cbind(Geneid = counts$Geneid, counts_tpm) %>% as_tibble()

Get the normalization factor (in this case RLE)

normfactor <- DGEList(counts = counts_tpm[,2:ncol(counts_tpm)],
                        group = colnames(counts_tpm[,2:ncol(counts_tpm)])
                        )

#you can change the normalization method here
normfactor <- calcNormFactors(normfactor, method="RLE")
  
normfactor_samples <- normfactor$samples
  
#multiply normalization factor with the library size
normfactor_samples$normlib <- normfactor_samples$lib.size*normfactor_samples$norm.factors

Loop to get the final TPM counts table

for(i in 1:(dim(counts_tpm)[2]-1)){
    
  counts_tpm[,i+1] <- (counts_tpm[,i+1]/normfactor_samples$normlib[i])*1000000
    
}
head(counts_tpm)
## # A tibble: 6 x 5
##   Geneid             B00_A01 B00_A02 B00_A03 B00_A04
##   <fct>                <dbl>   <dbl>   <dbl>   <dbl>
## 1 ENSGALG00000054818     0       0       0       0  
## 2 ENSGALG00000053455     0       0       0       0  
## 3 ENSGALG00000045540    54.6    49.4    37.3    33.1
## 4 ENSGALG00000051297   135.    135.    120.    165. 
## 5 ENSGALG00000042023    51.5    41.3    63.1    61.9
## 6 ENSGALG00000047594     0       0       0       0
 

A work by Johannes Nicolaus Wibisana

johannes.nicolaus@gmail.com