反馈方式:
- 本文的任何错误,请在留言中指正;也可发邮件至[email protected],欢迎交流;
- 对于任何关于新功能的建议,也可按上一步交流;
本程序待改进地方:
- 想着,在运行程序的同时,程序会将自身复制一份到输出文件夹用于备份(current_file_path_getter);但是该函数的可移植性很差,暂时无法识别以R CMD方式运行该脚本,但是通过source("")和“R --file=”方式运行时没有问题的;也请有更好方法的牛人不吝赐教,谢谢先;
- 期望在每一步完成后,脚本自动发邮件到指定邮箱提醒或告知用户;但是,“sendmailR”这个程序包,不太会用,研究中;
- 该脚本,写得臭长臭长的,很多地方可以更简练;之所以这么长,主要是个人看着清晰;
脚步说明:
- 本脚本可以免费、自由使用;
- 本脚本目前只使用于单端测序;
使用方法:
- 安装必要的软件,这个应该不是问题的;主要有Tophat,Cufflinks,FastQC等;
- 新建一个工作目录,并将脚步的“working_directory”设置为该新建的工作目录;
- 在工作目录下,新建“input_files”夹;
- 在“input_files”文件夹下,新建“sequenced_reads”文件夹。并将且仅将所有的测序数据文件放入该文件夹下;
- 根据样本的物种,在“input_files”文件夹,新建以物种命名的文件夹(例“Homo_sapiens”),同时将脚本中的“specie_name”设置为物种名。
- 在以物种命名的文件夹(Homo_sapiens)下,新建用于存放Bowtie2 index文件的文件夹“Bowtie2Index”和用于存放基因注释的“Genes”文件夹;
- 到IGenomes主页下载感兴趣的对应物种的参考文件和注释(Reference Sequences and Annotations)。并将iGenomes压缩包中对应的文件放入上一步建立的文件中。
- 根据所运行的系统选择相应的方法,运行该脚本;
- 运行约一两分钟后,本脚本会在工作目录下生成“output_files_××××××××_××××××”文件夹,进入该文件夹找到“accepted_hits_bam.txt”文件,按以下步骤编辑(考虑到不同机器的运行速度,更适合编辑该文件的时机是,“output_files_××××××××_××××××”文件夹下的“tophat_output”文件夹不为空):
- 在文件开头插入一新行,输入:“sample_id group_label”(注,不包括引号,中间已制表符tab分割;
- 在其他行后,添加一个制表符tab,然后输入改行所对应样本的标签(label,用于分组),组内不同重复请使用相同的标签;
- 保存该文件。
- 等待机器运行完成,就好了;
- 使用cummeRbund进行下游分析;
代码如下:
#! /data02/software/R/bin/R ## Created by Roger Young, 2015-02-05 ## nohup /data02/software/R/bin/R --file=~/mRNA_workflow.R &> ./nohup.log ## nohup /data02/software/R/bin/R CMD BATCH ~/mRNA_workflow.R &> ./nohup.log ##-------Those lines should be changed accordingly----------------------------- ## 正常情况下,需要根据需要更改以下几行:output_folder和output_folder; ## directory: 代表一个目录的绝对路径, 以"/"结尾;另有说明的除外! ## foler: 代表一个文件夹的文件名,以"/"结尾;另有说明的除外! ## path: 代表一个文件的绝对路径;另有说明的除外! DEBUG_FLAG <- FALSE; ## R路径: /data02/software/R/bin/R ## source("~/Documents/mRNA_profiles/mRNA_workflow.R") working_directory <- "~/Documents/mRNA_profiles/"; ## 如果在Linux系统下测试,请酌情修改。 ## C:\Program Files\R\R-3.1.2\bin\i386\R.exe --file=~/mRNA_workflow.R if(DEBUG_FLAG){ working_directory <- "D:/roger/data/mRNA/"; } output_folder <- paste("output_files_", format(Sys.time(), "%Y%m%d_%I%M%S"),"/", sep=""); output_directory <- paste(working_directory, output_folder, sep =""); dir.create(path = output_directory, recursive = TRUE, showWarnings = FALSE); ##-------所用到的软件目录,需根据具体设备更改------------------------------------------------ fastqc_path <- "~/bin/FastQC/fastqc"; tophat_app_path <- "tophat"; cufflinks_app_path <- "cufflinks"; cuffmerge_app_path <- "cuffmerge"; cuffdiff_app_path <- "cuffdiff"; ## Parameters used by all those softwares! threads_params <- "-p 10"; debug_flag <- " DEBUG FLAG: "; ##-------输出文件夹,一般无需修改---------------------------------------------------- log_folder <- "log/"; log_directory <- paste(output_directory, log_folder,sep = ""); dir.create(path = log_directory, recursive = TRUE, showWarnings = FALSE); sink(file = paste(log_directory, "log_", format(Sys.time(), "%Y%m%d_%I%M%S"), ".log", sep=""), split = TRUE); ##-----获取本脚本的路径,并保存在输出文件夹中(注,尚需修改以保证可移植性)-------------- current_file_path_getter <- function() { cmdArgs <- commandArgs(trailingOnly = FALSE); print(paste("cmdArgs: ",paste(commandArgs(), collapse = " "))); R_CMD_indicator <- "[[:blank:]]CMD[[:blank:]]"; CMD_MODE_matched <- grep(pattern = R_CMD_indicator, x = cmdArgs); if(length(CMD_MODE_matched) > 0){ }else{ file_arg_indicator <- "--file="; matched <- grep(pattern = file_arg_indicator, x = cmdArgs); if (length(matched) > 0) { # Rscript return(normalizePath(sub(pattern = file_arg_indicator, replacement = "", x = cmdArgs[matched]))); } else { # ‘source‘d via R console return(normalizePath(sys.frames()[[1]]$ofile)); } } } current_script_path <- current_file_path_getter(); print(paste("current_script_path: ", current_script_path)); file.copy(from = current_file_path_getter(), to = paste(output_directory,"/RScript.R",sep = ""), overwrite = TRUE, copy.mode = TRUE, copy.date = TRUE); ##-------定义输入文件和文件夹----------------------------------------------- input_folder <- "input_files/"; input_directory <- paste(working_directory, input_folder, sep = ""); sequenced_reads_folder <- "sequenced_reads"; ## 没有"/" sequenced_reads_directory <- paste(input_directory, sequenced_reads_folder, sep = ""); ##-------Genome References----------------------------------------------------- specie_name <- "Homo_sapiens/"; annotation_file_name <- "genes.gtf"; annotation_file_folder <- "Genes/"; annotation_index_directory <- paste(input_directory, specie_name, annotation_file_folder , sep = ""); annotation_index_path <-paste(annotation_index_directory, annotation_file_name, sep = ""); bowtie_index_name <- "genome"; bowtie_index_folder <- "Bowtie2Index/"; bowtie_index_directory <- paste(input_directory, specie_name, bowtie_index_folder , sep = ""); bowtie_index_path <-paste(bowtie_index_directory, bowtie_index_name, sep = ""); ##-------用于确认所依赖的包已正确安装------------------------------------------ check_bioconductor_pkg <- function(pkgs) { if(require(pkgs , character.only = TRUE)){ print(paste("******", pkgs,"is loaded correctly", "******")); } else { print(paste("******Trying to install ", pkgs, "******")); ## Update R packages print("******Update R packages******"); update.packages(ask = FALSE); source("http://bioconductor.org/biocLite.R"); biocLite(); ## Install desired Packages biocLite(pkgs); ## Re-check if(require(pkgs, character.only = TRUE)){ print(paste("******",pkgs, "installed and loaded!", "******")); } else { stop(paste("??????","Error: could not install",pkgs, "??????")); } } } check_pkg <- function(pkgs) { if(require(pkgs , character.only = TRUE)){ print(paste("******", pkgs,"is loaded correctly", "******")); } else { print(paste("******Trying to install ", pkgs, "******")); # ## Update R packages print("******Update R packages******"); update.packages(ask = FALSE); install.packages(pkgs); ## Re-check if(require(pkgs, character.only = TRUE)){ print(paste("******",pkgs, "installed and loaded!", "******")); } else { stop(paste("??????????","Error: could not install", pkgs, "??????????")); } } } ## (check_pkg("sendmailR")); ## (check_bioconductor_pkg("cummeRbund")); ##-------获取所需处理的测序文件列表------------------------------------------ (sequenced_files <- list.files(path = sequenced_reads_directory, pattern = "*[^(.md5)|(.txt)]$", all.files = TRUE, full.names = TRUE, recursive = TRUE)); ##-------测序质量评估-------------------------------- (fastqc_output_folder <- "fastqc_output/"); (fastqc_output_directory <- paste(output_directory, fastqc_output_folder,sep = "")); (dir.create(path = fastqc_output_directory, recursive = TRUE, showWarnings = FALSE)); (sequenced_file_lists <- sequenced_files); print(paste("sequenced_file_lists: ", sequenced_file_lists)); for (files_to_test in sequenced_file_lists){ print(paste("files_to_test: ", files_to_test)); fastqc_params <- "--outdir="; fastqc_output_directory; ## Tedious lines for output folder construction temp_output_directory_dot_repaced <- gsub(pattern = "\\.", replacement = "_", x = files_to_test); fastqc_output_directory <- gsub(pattern = paste(input_folder, sequenced_reads_folder, "/", sep=""), ## specical replacement = paste(output_folder, fastqc_output_folder, sep = ""),, x = temp_output_directory_dot_repaced); print(paste(debug_flag, "fastqc_output_directory: ", fastqc_output_directory)); dir.create(fastqc_output_directory, recursive = TRUE); ## 构建用于运行fastqc的命令 fastqc_cmd <- paste(fastqc_path, files_to_test, paste( fastqc_params, fastqc_output_directory, sep = ""), sep = " "); print(paste(debug_flag,"fasqc_cmd: ", fastqc_cmd)); ##质量评估与后续部分无联系,所以在后台运行,以提高速度! if(!DEBUG_FLAG){ print("Running fastqc!") system(command = fastqc_cmd, wait = FALSE); } else { print("In Debug mode!") } } ##-------Align the RNA-seq reads to the genomes-------------------------------- ## Map the reads for each sample to the reference genome (tophat_output_folder <- "tophat_output/"); (tophat_output_directory <- paste(output_directory, tophat_output_folder, sep = "")); (dir.create(path = tophat_output_directory, recursive = TRUE, showWarnings = FALSE)); (cufflinks_output_folder <- "cufflinks_output/"); (cufflinks_output_directory <- paste(output_directory, cufflinks_output_folder, sep = "")); (dir.create(path = cufflinks_output_directory, recursive = TRUE, showWarnings = FALSE)); (sequenced_file_lists <- sequenced_files); (cuffmerge_assemblies_file <- "assemblies.txt") (cuffmerge_assemblies_file_text <- ""); (cuffmerge_assemblies_file_path <- paste(output_directory, cuffmerge_assemblies_file, sep = "")); (accepted_hits_bam_file <- "accepted_hits_bam.txt") (accepted_hits_bam_file_text <- ""); (accepted_hits_bam_file_path <- paste(output_directory, accepted_hits_bam_file, sep = "")); ## 该部分用于构建cuffdiff所需要用的accepted_hits_bam_file_path文件; ## 该部分运行完后需要按要求修改accepted_hits_bam_file_path文件。 for(sequenced_read in sequenced_file_lists){ print(paste(debug_flag, "The file to process: ", sequenced_read)); temp_output_directory_dot_repaced <- gsub(pattern = "\\.", replacement = "_", x = sequenced_read); tophat_output_directory <- gsub(pattern = paste(input_folder, sequenced_reads_folder, "/", sep=""), ## specical replacement = paste(output_folder, tophat_output_folder, sep = ""),, x = temp_output_directory_dot_repaced); accepted_hits_bam_file_text <- paste(accepted_hits_bam_file_text, tophat_output_directory, "/", "accepted_hits.bam", "\n", sep = ""); cufflinks_output_directory <- gsub(pattern = paste(input_folder, sequenced_reads_folder, "/", sep=""), ## specical replacement = paste(output_folder, cufflinks_output_folder, sep = ""),, x = temp_output_directory_dot_repaced); cuffmerge_assemblies_file_text <- paste(cuffmerge_assemblies_file_text, cufflinks_output_directory, "/", "transcripts.gtf", "\n", sep = ""); } write(cuffmerge_assemblies_file_text, file = cuffmerge_assemblies_file_path); write(accepted_hits_bam_file_text, file = accepted_hits_bam_file_path); accepted_hits_bam_file_modify_time <- file.info(accepted_hits_bam_file_path); ## 给用户发邮件,告知accepted_hits_bam_file_path文件已建立,请修改; print(paste("Please edit file: ", accepted_hits_bam_file_path)); print(paste("First, insert the following line (seperated by tab): \"sample_id group_label\";")); print("The, Append each other lines with a tab, followed by the sample‘s label (group)") print("Finally, Save the file!!") number_of_files_had_processed <- 0; ##length(sequenced_file_lists); print(paste(debug_flag, "number_of_files_had_processed: ", number_of_files_had_processed)); for(sequenced_read in sequenced_file_lists){ print(paste(debug_flag, "The file to process: ", sequenced_read)); temp_output_directory_dot_repaced <- gsub(pattern = "\\.", replacement = "_", x = sequenced_read); tophat_output_directory <- gsub(pattern = paste(input_folder, sequenced_reads_folder, "/", sep=""), ## specical replacement = paste(output_folder, tophat_output_folder, sep = ""),, x = temp_output_directory_dot_repaced); print(paste(debug_flag, "Tophat output folder: ",tophat_output_directory)); (dir.create(path = tophat_output_directory, recursive = TRUE, showWarnings = FALSE)); ##构建运行tophat的命令 (tophat_cmd <- paste(tophat_app_path, threads_params, "-G", annotation_index_path, "-o", tophat_output_directory, bowtie_index_path, sequenced_read, sep = " ")); print(paste(debug_flag, "tophat_cmd: ",tophat_cmd)); if(!DEBUG_FLAG){ system(command = tophat_cmd, wait = TRUE); } ## 构建运行cufflinks的命令 cufflinks_output_directory <- gsub(pattern = paste(input_folder, sequenced_reads_folder, "/", sep=""), ## specical replacement = paste(output_folder, cufflinks_output_folder, sep = ""),, x = temp_output_directory_dot_repaced); print(paste(debug_flag, "cufflinks output directory", cufflinks_output_directory)) (dir.create(path = cufflinks_output_directory, recursive = TRUE, showWarnings = FALSE)); (cufflinks_cmd <- paste(cufflinks_app_path, threads_params, "-o", cufflinks_output_directory, paste(tophat_output_directory,"/", "accepted_hits.bam", sep = ""), sep = " ")); print(paste(debug_flag, "cufflinks_cmd: ",cufflinks_cmd)); ##同步运行cufflinks! number_of_files_had_processed <- number_of_files_had_processed + 1; print(paste(debug_flag, "Number of files had been handled: ", number_of_files_had_processed)); if(!DEBUG_FLAG){ system(command = cufflinks_cmd, wait = TRUE); } } warnings() ##-------Run Cuffmerge to create a single merged transcriptome annotation------ (cuffmerge_output_folder <- "cuffmerge_output"); ##没有“/” (cuffmerge_output_directory <- paste(output_directory, cuffmerge_output_folder, sep = "")); dir.create(path = cuffmerge_output_directory, recursive = TRUE, showWarnings = FALSE); cuffmerge_cmd <- paste(cuffmerge_app_path, "--ref-gtf", annotation_index_path, "-o", cuffmerge_output_directory, "--ref-sequence", paste(bowtie_index_directory,"genome.fa", sep = ""), threads_params, cuffmerge_assemblies_file_path, sep = " "); print(paste(debug_flag, "cuffmerge_cmd: ",cuffmerge_cmd)); if(!DEBUG_FLAG){ system(command = cuffmerge_cmd, wait = TRUE); } ##-------Identify differentially expressed genes and transcripts------ (cuffdiff_output_folder <- "cuffdiff_output/"); (cuffdiff_output_directory <- paste(output_directory, cuffdiff_output_folder, sep = "")); (dir.create(path = cuffdiff_output_directory, recursive = TRUE, showWarnings = FALSE)); cuffdiff_cmd <- paste(cuffdiff_app_path, "-use-sample-sheet", "-o", cuffdiff_output_directory, "-b", paste(bowtie_index_directory,"genome.fa", sep = ""), threads_params, "-u", paste(cuffmerge_output_directory,"/","merged.gtf",sep=""), accepted_hits_bam_file_path, sep=" ") print(paste(debug_flag, "cuffdiff_cmd: ",cuffdiff_cmd, sep = "")); if(DEBUG_FLAG){ Sys.sleep(time = 20); } ## 此处,我们认为用户已正确修改了accepted_hits_bam_file_path所指代的文件 if(identical(file.info(accepted_hits_bam_file_path), accepted_hits_bam_file_modify_time)){ print(paste("This file stays unchanged: ", accepted_hits_bam_file_path, sep = "")); print("cuffdiff is skipped! Please run it manually!!"); print(paste("The cmd for running cuffdiff is: ",cuffdiff_cmd, sep = "")); } else { print("cuffdiff_cmd is running"); if(!DEBUG_FLAG){ system(command = cuffdiff_cmd, wait = TRUE,); } } print("Done!"); sink(); print("Done!"); if(!DEBUG_FLAG){ quit(save = "no"); }
完成!
时间: 2024-10-06 17:56:17