项目作者: renatopuga

项目描述 :
RNASeq and Call Variants
高级语言: Shell
项目地址: git://github.com/renatopuga/rnaseq-callvariants.git
创建时间: 2021-03-02T18:24:36Z
项目社区:https://github.com/renatopuga/rnaseq-callvariants

开源协议:

下载


RNASeq and Call Variants

The data will be available after publication.

Machine

  • Ubuntu Desktop 20.04 LTS
  • 6 cpus
  • 32GB RAM Memory
  • 256GB SSD (/scratch) and 1TB HDD

Requirements

  • Docker version 19.03.8
  • STAR v2.7.5c
  • RSEM v1.3.1
  • human genome GRCh37.75
  • samtools 1.10
  • vcf-merge
  • VEP ensemble v102

Sample Information

  • TruSeq stranded mRNA 150 cycles
  • NextSeq 550
Lineage Phenotype
PK03 Controle
PK05 Controle
PK06 Controle
CQ16 Controle
PK01 PSP
PK02 PSP
PK04 PSP
PK07 PSP
PK08 PSP
PK09 PSP

References: Download and Compile

  1. # get reference genome fasta
  2. wget -c ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz
  3. # get refernece genome gtf
  4. wget -c ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz
  5. gunzip -v Homo_sapiens.GRCh37.75.gtf.gz
  6. gunzip -v Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz
  7. GTF="Homo_sapiens.GRCh37.75.gtf"
  8. FASTA="Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
  9. mkdir -p rsem
  10. # rsem: prepare reference
  11. /scratch/apps/RSEM/rsem-prepare-reference --gtf $GTF \
  12. --bowtie2 \
  13. $FASTA \
  14. rsem/rsem
  15. # caminho do executavel do programa STAR
  16. GENOMEDIR="/scratch/refs/release-75/"
  17. STAR="/scratch/apps/STAR/bin/Linux_x86_64/STAR"
  18. RSEM="/scratch/apps/RSEM"
  19. # 1/4 - Generating genome indexes
  20. $STAR --runThreadN 6 \
  21. --runMode genomeGenerate \
  22. --genomeDir $GENOMEDIR \
  23. --genomeFastaFiles $FASTA \
  24. --sjdbGTFfile $GTF \
  25. --sjdbOverhang 149

Mapping and counting (STAR and RSEM)

  1. ### STAR
  2. # paths
  3. GENOMEDIR="/scratch/refs/release-75"
  4. STAR="/scratch/apps/STAR/bin/Linux_x86_64/STAR"
  5. RSEM="/scratch/apps/RSEM"
  6. FASTA="$GENOMEDIR/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa "
  7. GTF="$GENOMEDIR/Homo_sapiens.GRCh37.75.gtf"
  8. # create a file with unique sample names
  9. ls -1d fastq/*/*.fastq.gz | grep -v "Undetermined" | sed -e "s/_L00[0-4]_R[12]_001.fastq.gz//g" | sort -Vu > FileSample
  10. # create directory for STAR output: RNASEQ_data
  11. mkdir -p RNASEQ_data
  12. mkdir -p ~/RNASEQ_data/
  13. # run STAR mapping and counting
  14. for f in `cat FileSample`;
  15. do
  16. ls $f/*/*R1*.fastq.gz | tr '\n' ',' | sed -s "s/\,$/ /g" > R1
  17. ls $f/*/*R2*.fastq.gz | tr '\n' ',' | sed -s "s/\,$/ /g" > R2
  18. fastqR1=$(cat R1)
  19. fastqR2=$(cat R2)
  20. mkdir "RNASEQ_data/$(basename $f)";
  21. $STAR --genomeDir $GENOMEDIR \
  22. --readFilesCommand zcat \
  23. --readFilesIn $fastqR1 $fastqR2 \
  24. --limitBAMsortRAM 8000000000 \
  25. --outSAMtype BAM SortedByCoordinate \
  26. --outSAMunmapped Within \
  27. --twopassMode Basic \
  28. --outFilterMultimapNmax 1 \
  29. --quantMode TranscriptomeSAM \
  30. --runThreadN 1 \
  31. --outFileNamePrefix "RNASEQ_data/$(basename $f)/";
  32. rm -f R1 R2
  33. mkdir "RNASEQ_data/rsem.$(basename $f)";
  34. $RSEM/rsem-calculate-expression --bam --no-bam-output -p 5 \
  35. --paired-end \
  36. --forward-prob 0 \
  37. RNASEQ_data/$(basename $f)/Aligned.toTranscriptome.out.bam \
  38. $GENOMEDIR/rsem/rsem \
  39. RNASEQ_data/rsem.$(basename $f)/rsem;
  40. #rm -rf "RNASEQ_data/$(basename $f)";
  41. #rm -f R1 R2
  42. rsync --progress -r "RNASEQ_data/$(basename $f)" ~/RNASEQ_data/$(basename $f)
  43. rm -rf "RNASEQ_data/$(basename $f)";
  44. done
  45. exit
  46. # by genes
  47. cd RNASEQ_data
  48. mkdir gene-level
  49. cd gene-level
  50. ls -d1 ../rsem.* | gawk '{print("ln -s",$1"/rsem.genes.results",gensub("../rsem.","","g",$1))}' | sh
  51. cd ..
  52. time R --file=../run.merge.files.R --args gene-level 5 gene-level-5
  53. # by isoforms
  54. mkdir transcript-level
  55. cd transcript-level
  56. ls -d1 ../rsem.* | gawk '{print("ln -s",$1"/rsem.genes.results",gensub("../rsem.","","g",$1))}' | sh
  57. cd ..
  58. time R --file=../run.merge.files.R --args transcript-level 5 transcript-level

EdgeR - Differential Gene Expression

  1. # Fonte: https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
  2. # carregar a biblioteca edgeR
  3. library("edgeR")
  4. # table with gene count
  5. # First Column: Symbol
  6. # The remaining columns are the gene count
  7. TabelaCount <- "merge-table-STAR-gene-level-5-50x.csv"
  8. # loading the table (the first column is called Symbol)
  9. x <- read.delim(TabelaCount,row.names="symbol")
  10. # groups: 1 and 2
  11. # The first 4 samples group 1
  12. # 5 afterwards is group 2
  13. group <- factor(c(1,1,1,1,2,2,2,2,2))
  14. y <- DGEList(counts=x,group=group)
  15. # We filter out lowly expressed genes
  16. keep <- filterByExpr(y)
  17. y <- y[keep,,keep.lib.sizes=FALSE]
  18. # TMM normalization and display the normalization factors
  19. y <- calcNormFactors(y)
  20. png("plotMDS.png")
  21. plotMDS(y)
  22. dev.off()
  23. design <- model.matrix(~group)
  24. y <- estimateDisp(y,design)
  25. png("plotBCV.png")
  26. plotBCV(y)
  27. dev.off()
  28. #To perform likelihood ratio tests:
  29. fit <- glmFit(y,design)
  30. lrt <- glmLRT(fit,coef=2)
  31. topTags(lrt)
  32. png("plotMD.png")
  33. plotMD(lrt)
  34. abline(h=c(-1, 1), col="blue")
  35. dev.off()
  36. write.table(lrt$table,"lrt.DGElist.csv",quote=FALSE, sep="\t")

GATK4 - Call Variants

by https://github.com/gatk-workflows/gatk4-rnaseq-germline-snps-indels

  1. # scratch
  2. genome="/scratch/refs/release-75/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
  3. dbsnp="/scratch/bucket/dbsnp_138.b37.vcf.gz"
  4. intervals="/scratch/refs/release-75/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.interval_list"
  5. tmp="/data/tmp"
  6. sample=$1
  7. LB="RNASeq"
  8. PL="illumina"
  9. PU="NextSeq500"
  10. # listar amostras
  11. bamList=$(ls -1 */*/*.bam)
  12. for bam in $bamList
  13. do
  14. out=$(echo $bam | sed "s/\/.*//")
  15. # run MarkDuplicates
  16. docker run --user "$(id -u):$(id -g)" -it --rm -v /tmp:/tmp -v /scratch:/scratch -v $(pwd):/data broadinstitute/gatk:4.1.4.1 gatk --java-options "-Djava.io.tmpdir=${tmp} -Xmx8G -XX:+UseParallelGC -XX:ParallelGCThreads=8" MarkDuplicates \
  17. --TMP_DIR $tmp \
  18. -I /data/$bam -O /data/$out.sorted.dup.bam \
  19. -M /data/$out.sorted.dup_metrics \
  20. --VALIDATION_STRINGENCY SILENT \
  21. --CREATE_INDEX true \
  22. # run SplitNCigarReads
  23. docker run --user "$(id -u):$(id -g)" -it --rm -v /tmp:/tmp -v /scratch:/scratch -v $(pwd):/data broadinstitute/gatk:4.1.4.1 gatk SplitNCigarReads \
  24. -R $genome \
  25. -L $intervals_genes \
  26. -I /data/$out.sorted.dup.bam \
  27. -O /data/$out.sorted.dup.split.bam
  28. # run AddOrReplaceGroups
  29. docker run --user "$(id -u):$(id -g)" -it --rm -v /tmp:/tmp -v /scratch:/scratch -v $(pwd):/data broadinstitute/gatk:4.1.4.1 gatk AddOrReplaceReadGroups \
  30. -I /data/$out.sorted.dup.split.bam \
  31. -O /data/$out.sorted.dup.split.gp.bam \
  32. -ID $out \
  33. -LB $LB \
  34. -PL $PL\
  35. -PU $PU \
  36. -SM $out
  37. # run index ...gp.bam
  38. samtools index $out.sorted.dup.split.gp.bam
  39. # run BaseRecalibrator
  40. docker run --user "$(id -u):$(id -g)" -it --rm -v /scratch/:/scratch -v $(pwd):/data/ broadinstitute/gatk:4.1.4.1 gatk --java-options "-Xmx8G -XX:+UseParallelGC -XX:ParallelGCThreads=4" BaseRecalibrator \
  41. -L $intervals_genes \
  42. -R $genome \
  43. -I /data/$out.sorted.dup.split.gp.bam \
  44. --known-sites $dbsnp \
  45. -O /data/$out.sorted.dup.split.gp.recal.table
  46. # run ApplyBQSR
  47. docker run --user "$(id -u):$(id -g)" -it --rm -v /scratch:/scratch -v $(pwd):/data broadinstitute/gatk gatk --java-options "-Xmx8G -XX:+UseParallelGC -XX:ParallelGCThreads=8" ApplyBQSR \
  48. -R $genome \
  49. -I /data/$out.sorted.dup.split.gp.bam \
  50. -bqsr /data/$out.sorted.dup.split.gp.recal.table \
  51. -L $intervals_genes --create-output-bam-index true --static-quantized-quals 10 --static-quantized-quals 20 --static-quantized-quals 30 \
  52. -O /data/$out.sorted.dup.split.gp.recal.bam
  53. # run HaplotypeCaller
  54. docker run --user "$(id -u):$(id -g)" -it --rm -v /scratch:/scratch -v $(pwd):/data broadinstitute/gatk:4.1.4.1 gatk --java-options "-Xmx8G -XX:+UseParallelGC -XX:ParallelGCThreads=4" HaplotypeCaller \
  55. -R $genome \
  56. -I /data/$out.sorted.dup.split.gp.recal.bam \
  57. -L $intervals_genes \
  58. -dont-use-soft-clipped-bases \
  59. --standard-min-confidence-threshold-for-calling 20 \
  60. --native-pair-hmm-threads 8 \
  61. -O /data/$out.g.vcf.gz \
  62. --dbsnp $dbsnp
  63. # run VariantFiltration
  64. docker run --user "$(id -u):$(id -g)" -it --rm -v /scratch:/scratch -v $(pwd):/data/ broadinstitute/gatk:4.1.4.1 gatk VariantFiltration \
  65. --R $genome \
  66. --V /data/$out.g.vcf.gz \
  67. --window 35 \
  68. --cluster 3 \
  69. --filter-name "FS" \
  70. --filter "FS > 30.0" \
  71. --filter-name "QD" \
  72. --filter "QD < 2.0" \
  73. -O /data/$out.vf.vcf.gz
  74. done
  75. # run list vcfs
  76. vcfs=$(ls -1 *.vf.vcf.gz)
  77. for i in $vcfs
  78. do
  79. vcfList=${vcfList}" ${i}"
  80. done
  81. # run vcf-merge
  82. vcf-merge $vcfList > samples.vcf
  83. # create vep_data output
  84. mkdir vep_data
  85. chmod a+w vep_data
  86. # run vep annotation
  87. docker run -it --rm -v $(pwd):/data -v /scratch/:/scratch -v /vep/:/opt/vep/.vep ensemblorg/ensembl-vep ./vep \
  88. -i /data/samples.vcf \
  89. -o /data/vep_data/samples.vep.tsv -v \
  90. --fork 4 --cache --distance 0 --use_given_ref \
  91. --pick --pick_allele --no_intergenic --exclude_predicted \
  92. --no_stats --offline --force_overwrite --everything --tab \
  93. --fasta $genome \
  94. --individual all