## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# library(ORFik) # This package
# # Here the default values are shown:
# where_to_save_config <- config_file() # Bioc Cache directory default
#
# parent_folder <- "~/bio_data/" # Change if you want it somewhere else
# fastq.dir <- file.path(parent_folder, "raw_data") # Raw fast files
# bam.dir <- file.path(parent_folder, "processed_data") # Processed files
# reference.dir <- file.path(parent_folder, "references") # Genome references
# exp.dir <- file.path(base.dir, "ORFik_experiments/") # ORFik experiments
# config.save(where_to_save_config,
# fastq.dir, bam.dir, reference.dir, reference.dir)
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# config()
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# conf <- config.exper(experiment = "CHALMERS_Yeast", # short experiment info: here I used author + species
# assembly = "Yeast_SacCer3", # Reference folder
# type = c("RNA-seq")) # fastq and bam type
# conf
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
#
# info <- download.SRA.metadata("SRP012047", outdir = conf["fastq RNA-seq"])
# # Let's take 2 first runs in this experiment:
# info <- info[1:2,]
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# # Subset
# # 18 MB, ~ 40 sec download time ->
# download.SRA(info, conf["fastq RNA-seq"], subset = 50000)
#
# # Or you could do the full libraries
# # 1.6 GB, ~ 100 sec download time
# # (Full size files have faster MB/s download speed than subsets) ->
# # download.SRA(info, conf["fastq RNA-seq"])
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# organism <- info$ScientificName[1]
# is_paired_end <- all(info$LibraryLayout == "PAIRED")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# annotation <- getGenomeAndAnnotation(
# organism = organism,
# output.dir = conf["ref"],
# optimize = TRUE,
# pseudo_5UTRS_if_needed = 100 # If species have not 5' UTR (leader) definitions, make 100nt pseudo leaders.
# )
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# annotation <- getGenomeAndAnnotation(
# organism = organism,
# genome = TRUE, GTF = TRUE,
# phix = TRUE, ncRNA = TRUE, tRNA = TRUE, rRNA = TRUE,
# output.dir = conf["ref"],
# assembly_type = "toplevel" # Usually leave this to "auto"
# )
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# gtf <- "/path/to/local/annotation.gtf"
# genome <- "/path/to/local/genome.fasta"
# makeTxdbFromGenome(gtf, genome, organism = "Saccharomyces cerevisiae")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# txdb <- "/path/to/local/annotation.gtf.db"
# cage <- "/path/to/CAGE.ofst" # Can be bed, wig etc (ofst is fastest)
# reassigned_txdb <- assignTSSByCage(txdb, cage)
# AnnotationDbi::saveDb("/path/to/local/annotation_cage.db")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# txdb <- "/path/to/local/annotation.gtf.db"
# reassigned_txdb <- assignTSSByCage(txdb, cage = NULL, pseudoLength = 100)
# AnnotationDbi::saveDb(reassigned_txdb,
# "/path/to/local/annotation_pseudo_leaders.db")
# leaders <- loadRegion(reassigned_txdb, "leaders")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# index <- STAR.index(annotation)
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# alignment <-
# STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index,
# paired.end = is_paired_end,
# steps = "tr-ge", # (trim needed: adapters found, then genome)
# adapter.sequence = "auto",
# max.cpus = 30, trim.front = 3, min.length = 20)
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# dir.create("~/UMIandTrimmed")
# system(paste(install.fastp(),
# "-i", "~/sample1.fastq.gz",
# "-o", "~/UMIandTrimmed/sample1.UMIandTrimmed.fastq.gz"
# "--adapter_sequence=AGATCGGAAGAGC",
# "--umi",
# "--umi_loc", "read1",
# "--umi_len", 12))
# # Read fastp documentation for info about umi arguments
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# alignment <-
# STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index,
# paired.end = is_paired_end,
# steps = "tr", # (Only trim)
# adapter.sequence = "auto",
# max.cpus = 30, trim.front = 3, min.length = 20)
# trimmed_files <- file.path(conf["bam RNA-seq"], "trim")
# # Check that trimmed reads are average < 50 bases, else collapsing makes little sense for speedup.
# trim_table <- ORFik:::trimming.table(trimmed_files)
# stopifnot(all(trim_table$trim_mean_length < 50))
# trimmed_fastq <- list.files(path = trimmed_files, pattern = "fastq", full.names = TRUE)
# ORFik::collapse.fastq(files = trimmed_fastq, outdir = file.path(conf["bam Ribo-seq"], "collapsed") # Collapse all files in trim folder
# # Then align using collapsed reads
# alignment <-
# STAR.align.folder(trimmed_files, conf["bam RNA-seq"], index,
# paired.end = is_paired_end,
# steps = "ge", # (Only genome alignment from collapsed)
# max.cpus = 30, min.length = 20)
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# index <- STAR.index(annotation, max.ram = 20, SAsparse = 2)
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index,
# max.cpus = 4) # Reduce cores to 4 usually works for most systems
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index,
# max.cpus = 4, steps = "tr-ge", resume = "ge") # Resume ge using completed tr
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# txdb_file <- paste0(annotation["gtf"], ".db") # Get txdb file, not raw gtf
# fa <- annotation["genome"]
# create.experiment(exper = "yeast_exp_RNA",
# dir = paste0(conf["bam RNA-seq"], "/aligned/"),
# txdb = txdb_file,
# fa = fa,
# organism = organism,
# viewTemplate = FALSE,
# pairedEndBam = is_paired_end # True/False per bam file
# )
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# df <- read.experiment("yeast_exp_RNA")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# QCreport(df)
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# remove.experiments(df) # Remove loaded libraries
# convertLibs(df, type = "ofst")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# remove.experiments(df)
# convertLibs(df, type = "wig")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# remove.experiments(df)
# outputLibs(df, type = "ofst")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# mrna <- countTable(df, region = "mrna", type = "fpkm")
# cds <- countTable(df, region = "cds", type = "fpkm")
# ratio <- cds / mrna