项目作者: sirusb

项目描述 :
An HMM-based domain caller from bw
高级语言: R
项目地址: git://github.com/sirusb/HMMDomainCaller.git
创建时间: 2020-07-04T16:55:10Z
项目社区:https://github.com/sirusb/HMMDomainCaller

开源协议:MIT License

下载


Load required libraries

  1. require(mhsmm)
  2. require(BSgenome.Mmusculus.UCSC.mm10)
  3. require(rtracklayer)
  4. require(dplyr)
  5. require(glue)
  6. # for plotting genome browswer view (optional)
  7. require(Gviz)
  8. require(TxDb.Mmusculus.UCSC.mm10.knownGene)
  9. require(org.Mm.eg.db)
  10. # just for colorful plotting
  11. require(crayon)

Load the scripts

  1. source("utils.R")

Define the list of bw files

  1. bw_file_paths = list(H2K27me3_MII = "bigWig_examples/H3K27me3_MII_mm10.sorted.Q10.dedup.sorted.bw",
  2. H2K27me3_2C = "bigWig_examples/H3K27me3_2C_mm10.sorted.Q10.dedup.sorted.bw",
  3. H2K27me3_4C = "bigWig_examples/H3K27me3_4C_mm10.sorted.Q10.dedup.sorted.bw"
  4. )

Load the mm10 blacklist regions (optional)

  1. mm10_blacklist = "mm10-blacklist.v2.bed"
  2. mm10_blacklist_gr = data.table::fread(mm10_blacklist)
  3. colnames(mm10_blacklist_gr) = c("seqnames","start","end","Type")
  4. mm10_blacklist_gr = GRanges(mm10_blacklist_gr)
  5. mm10_blacklist_gr = subset(mm10_blacklist_gr, Type == "High Signal Region")
  6. head(mm10_blacklist_gr)
  1. ## GRanges object with 6 ranges and 1 metadata column:
  2. ## seqnames ranges strand | Type
  3. ## <Rle> <IRanges> <Rle> | <character>
  4. ## [1] chr10 0-3135400 * | High Signal Region
  5. ## [2] chr10 4613500-4615400 * | High Signal Region
  6. ## [3] chr10 4761300-4763900 * | High Signal Region
  7. ## [4] chr10 6281200-6286700 * | High Signal Region
  8. ## [5] chr10 6740200-6742100 * | High Signal Region
  9. ## [6] chr10 7396300-7429800 * | High Signal Region
  10. ## -------
  11. ## seqinfo: 21 sequences from an unspecified genome; no seqlengths

Call Domains

  1. H3K27me3_domains <- list()
  2. for(bw in names(bw_file_paths)){
  3. bw_file= bw_file_paths[[bw]]
  4. H3K27me3_domains[[bw]] = CallDomains(bw_file = bw_file,
  5. winsize = 5000,
  6. stepsize = 5000,
  7. training.chrom = glue("chr{1:4}"),
  8. chromsToUse = glue("chr{1:19}"),
  9. genome = 'mm10',
  10. smooth = FALSE,
  11. mm10_blacklist_gr = mm10_blacklist_gr,
  12. saveProbs = FALSE,
  13. plot.model = TRUE,
  14. outDir = "HMMDomains"
  15. )
  16. }
  1. ##
  2. ## ******** Processing H3K27me3_MII_mm10.sorted.Q10.dedup.sorted.bw ********
  3. ## *) Training model
  4. ## Estimating the initial distribution P0
  5. ## Initialization parameters
  6. ## $P0
  7. ## [1] 0.09994937 0.90005063
  8. ##
  9. ## $mu
  10. ## [1] 19.982045 1.578732
  11. ##
  12. ## $sigma
  13. ## [1] 9.991023 1.578732
  14. ##
  15. ## defining model

  1. ## Fitting model
  2. ## *) Detecting domains:
  3. ## Generating bed file
  4. ## ====================
  5. ## Saved in :HMMDomains/H3K27me3_MII_mm10.sorted.Q10.dedup.sorted_5_domains.bed
  6. ##
  7. ##
  8. ## ******** Processing H3K27me3_2C_mm10.sorted.Q10.dedup.sorted.bw ********
  9. ## *) Training model
  10. ## Estimating the initial distribution P0
  11. ## Initialization parameters
  12. ## $P0
  13. ## [1] 0.2079394 0.7920606
  14. ##
  15. ## $mu
  16. ## [1] 9.181240 1.643701
  17. ##
  18. ## $sigma
  19. ## [1] 4.590620 1.643701
  20. ##
  21. ## defining model

  1. ## Fitting model
  2. ## *) Detecting domains:
  3. ## Generating bed file
  4. ## ====================
  5. ## Saved in :HMMDomains/H3K27me3_2C_mm10.sorted.Q10.dedup.sorted_5_domains.bed
  6. ##
  7. ##
  8. ## ******** Processing H3K27me3_4C_mm10.sorted.Q10.dedup.sorted.bw ********
  9. ## *) Training model
  10. ## Estimating the initial distribution P0
  11. ## Initialization parameters
  12. ## $P0
  13. ## [1] 0.3265698 0.6734302
  14. ##
  15. ## $mu
  16. ## [1] 6.389107 1.593652
  17. ##
  18. ## $sigma
  19. ## [1] 2.880374 1.499496
  20. ##
  21. ## defining model

  1. ## Fitting model
  2. ## *) Detecting domains:
  3. ## Generating bed file
  4. ## ====================
  5. ## Saved in :HMMDomains/H3K27me3_4C_mm10.sorted.Q10.dedup.sorted_5_domains.bed

Check the output folder

  1. list.files("HMMDomains/",full.names = T)
  1. ## [1] "HMMDomains/H3K27me3_2C_mm10.sorted.Q10.dedup.sorted_5_domains.bed"
  2. ## [2] "HMMDomains/H3K27me3_4C_mm10.sorted.Q10.dedup.sorted_5_domains.bed"
  3. ## [3] "HMMDomains/H3K27me3_MII_mm10.sorted.Q10.dedup.sorted_5_domains.bed"

Visualize the detected domains

Let’s display the Hoxc locus as an example

  1. hoxc.gr = GRanges("chr15:100859671-104043685")
  1. txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
  2. TxByGns <- genes(txdb)
  3. TxByGns = subset(TxByGns, seqnames == seqlevels(hoxc.gr))
  4. TxByGns$gene_name <- mapIds(org.Mm.eg.db,
  5. keys = as.character(TxByGns$gene_id),
  6. column = "SYMBOL",
  7. keytype = "ENTREZID",
  8. multiVals = "first")
  9. TxDbTrack <- GeneRegionTrack(TxByGns,chromosome=seqlevels(hoxc.gr),gene = TxByGns$gene_name)
  10. gtrack <- GenomeAxisTrack()
  11. bw_tracks <- c(gtrack, TxDbTrack)
  12. seqlen = seqlengths(BSgenome.Mmusculus.UCSC.mm10)[seqlevels(hoxc.gr)]
  13. region <- GRanges(seqlevels(hoxc.gr), IRanges(1,seqlen))
  14. for(bw in names(bw_file_paths)){
  15. bw_file= bw_file_paths[[bw]]
  16. coverage <- import.bw(bw_file,which=region)
  17. dt <- DataTrack(coverage,chomosome=seqlevels(hoxc.gr),name = glue("{bw} signal"))
  18. bw_tracks = c(bw_tracks, dt)
  19. atrack <- AnnotationTrack(H3K27me3_domains[[bw]], name = glue("{bw} domains"),chromosome = seqlevels(hoxc.gr))
  20. bw_tracks = c(bw_tracks, atrack)
  21. }
  22. plotTracks(bw_tracks,
  23. chromosome=seqlevels(hoxc.gr),
  24. from = start(hoxc.gr),
  25. to = end(hoxc.gr),
  26. type="h",
  27. transcriptAnnotation="gene")