from gff3 get gene fasta sequence(2)

from gff3 get gene fasta sequence ,or promoter sequence

#taken from https://www.biostars.org/p/46281/ and modified as needed 10/15/15
use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Bio::DB::Fasta;

#add a help message here
#my $num_args=$#ARGV + 1;
#if ($num_args != 4) {
#   print "\nUsage: gff2perl Genome.fasta Annotation.gff OutputPrefix \n\n";
#   exit;
#}

$| = 1;    # Flush output
my $outfile_cds = Bio::SeqIO->new( -format => ‘fasta‘, -file => ">$ARGV[2].cds.fasta" );
my $outfile_pep = Bio::SeqIO->new( -format => ‘fasta‘, -file => ">$ARGV[2].pep.fasta" );
my $outfile_cdna = Bio::SeqIO->new( -format => ‘fasta‘, -file => ">$ARGV[2].cdna.fasta" );
my $outfile_gene = Bio::SeqIO->new( -format => ‘fasta‘, -file => ">$ARGV[2].gene.fasta" );
my $outfile_upstream3000 = Bio::SeqIO->new( -format => ‘fasta‘, -file => ">$ARGV[2].upstream3000.fasta" );
my $outfile_exon = Bio::SeqIO->new( -format => ‘fasta‘, -file => ">$ARGV[2].exon.fasta");

###### Output type description ######
# cds - translated sequence (starting with ATG and ending with a stop codon included)
# cdna - transcribed sequence (devoid of introns, but containing untranslated exons)
# protein - cds translated (includes a * as the stop codon)
# gene - the entire gene sequence (including UTRs and introns)
# upstream3000 - the 3000 upstream region of the gene (likely including the promoter)

### First, index the genome
my $file_fasta = $ARGV[0];
my $db = Bio::DB::Fasta->new($file_fasta);
print ("Genome fasta parsed\n");

### Second, parse the GFF3
my %CDS;
my %CDNA;
my %EXON;
my $mRNA_name;
my $frame;
open GFF, "<$ARGV[1]" or die $!;
while ( my $line = <GFF> ) {
    chomp $line;
    my @array = split( "\t", $line );
    my $type = $array[2];

    if ($type eq ‘gene‘ || $type eq ‘mt_gene‘ ) {
        my @attrs = split( ";", $array[8] );
        $attrs[0] =~ s/ID=//;
        my $gene_name = $attrs[0];
        my $gene_start = $array[3];
        my $gene_end = $array[4];
        my $gene_seq = $db->seq( $array[0], $gene_start, $gene_end );
        my $output_gene = Bio::Seq->new(
            -seq        => $gene_seq,
            -id         => $gene_name,
            -display_id => $gene_name,
            -alphabet   => ‘dna‘,
        );

        # The upstream 3000
        my $upstream_start;
        my $upstream_end;
        if($array[6] eq ‘+‘) {
            $upstream_start=$gene_start-3000;
            $upstream_end=$gene_start-1;
        }
        elsif ($array[6] eq ‘-‘) {
            $upstream_start=$gene_end+1;
            $upstream_end=$gene_end+3000;
        }
        my $upstream_seq = $db->seq( $array[0], $upstream_start, $upstream_end );
        my $output_upstream3000 = Bio::Seq->new(
            -seq        => $upstream_seq,
            -id         => $gene_name."_upstream3000",
            -display_id => $gene_name."_upstream3000",
            -alphabet   => ‘dna‘,
        );

        # Reverse Complement if the frame is minus
        if($array[6] eq ‘+‘) {
        }
        elsif ($array[6] eq ‘-‘) {
            $output_gene = $output_gene->revcom();
            $output_upstream3000 = $output_upstream3000->revcom();
        }
        else {
            die "Unknown frame! At line $. of the GFF\n";
        }
    #added an if statement for all outputs requiring there to be sequence information before writing to file otherwise the fasta file contains lots of empty fasta headers
        if (length($gene_seq) != 0) {
    $outfile_gene->write_seq($output_gene);
    }
    if (length($upstream_seq) != 0) {
        $outfile_upstream3000->write_seq($output_upstream3000);
    }
    }

#CDS
    if ( ( $type eq ‘mRNA‘ || $type eq ‘transcript‘ ) and ( $. > 2 ) ) {
        # CDS: Collect CDSs and extract sequence of the previous mRNA
        my $mergedCDS_seq;
    # WARNING we must sort by $cds_coord[1]

        foreach my $key (sort {$a <=> $b} keys %CDS) { # Ascending numeric sort of the starting coordinate
            my $coord = $CDS{$key};
            my @cds_coord = split( " ", $coord );
            my $cds_seq = $db->seq( $cds_coord[0], $cds_coord[1], $cds_coord[2] );
            $mergedCDS_seq .= $cds_seq;
        }

        my $output_cds = Bio::Seq->new(
            -seq        => $mergedCDS_seq,
            -id         => $mRNA_name,
            -display_id => $mRNA_name,
            -alphabet   => ‘dna‘,
        );
        if ($frame eq ‘-‘) {
            $output_cds = $output_cds->revcom();
        }
    #translate CDS to peptide for protein sequence
        my $output_pep = $output_cds->translate();
    #write to file
    if (length($mergedCDS_seq) != 0) {
        $outfile_cds->write_seq($output_cds);
    }
    if (length($mergedCDS_seq) != 0) {
        $outfile_pep->write_seq($output_pep);
    }

#exons
#should be able to add exon output here since exons will be useful in gene models for other organisms can be added in the EVM program
        my $mergedEXON_seq;
        foreach my $key (sort {$a <=> $b} keys %EXON) { # Ascending numeric sort of the starting coordinatg
            my $coord = $EXON{$key};
            my @exon_coord = split( " ", $coord );
            my $exon_seq = $db->seq( $exon_coord[0], $exon_coord[1], $exon_coord[2] );
            $mergedEXON_seq .= $exon_seq;
        }

       my $output_exon = Bio::Seq->new(
            -seq        => $mergedEXON_seq,
            -id         => $mRNA_name,
            -display_id => $mRNA_name,
            -alphabet   => ‘dna‘,
        );
        if ($frame eq ‘-‘) {
            $output_exon = $output_exon->revcom();
        }
    #write to file
        if (length($mergedEXON_seq) != 0) {
        $outfile_exon->write_seq($output_exon);
        }

        # CDNA: Collect UTRs and CDSs and extract sequence of the previous mRNA
        my $mergedCDNA_seq;
        foreach my $key (sort {$a <=> $b} keys %CDNA) { # Ascending numeric sort of the starting coordinate
            my $coord = $CDNA{$key};
            my @cds_coord = split( " ", $coord );
            my $cds_seq = $db->seq( $cds_coord[0], $cds_coord[1], $cds_coord[2] );
            $mergedCDNA_seq .= $cds_seq;
        }

        my $output_cdna = Bio::Seq->new(
            -seq        => $mergedCDNA_seq,
            -id         => $mRNA_name,
            -display_id => $mRNA_name,
            -alphabet   => ‘dna‘,
        );
        if ($frame eq ‘-‘) {
            $output_cdna = $output_cdna->revcom();
        }
        if (length($mergedCDNA_seq) != 0) {
    $outfile_cdna->write_seq($output_cdna);
    }

        # Now initialize the next mRNA
        my @attrs = split( ";", $array[8] );
        $attrs[0] =~ s/ID=//;
        $mRNA_name = $attrs[0];
        $frame=$array[6];
        %CDS = (); %CDNA = (); # Empty the chunk arrays
    %EXON = (); %EXON = (); #Empty the EXON chunk arrays
    }
    elsif ( $type eq ‘mRNA‘ ) {    # First mRNA
        my @attrs = split( ";", $array[8] );
        $attrs[0] =~ s/ID=//;
        $mRNA_name = $attrs[0];
        $frame=$array[6];
    }
    elsif ( $type eq ‘CDS‘ ) {
        my $cds_coord = $array[0] . " " . $array[3] . " " . $array[4];
        $CDS{$array[3]}=$cds_coord;
        $CDNA{$array[3]}=$cds_coord;
    }
    elsif ($type eq ‘UTR‘ ) {
        my $utr_coord = $array[0] . " " . $array[3] . " " . $array[4];
        $CDNA{$array[3]}=$utr_coord;
    }
    elsif ($type eq ‘exon‘ ) {
    my $exon_coord = $array[0] . " " . $array[3] . " " . $array[4];
    $EXON{$array[3]}=$exon_coord;
    }
}

close GFF;

  

时间: 2024-10-29 10:08:21

from gff3 get gene fasta sequence(2)的相关文章

从gff3文件中获取fasta文件

chr1A NRGenome gene 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 chr1A NRGenome mRNA 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Name=TRIAE_CS42_U_TGACv1_6

How to prepare a FASTA file for calling SNP by GATK

1, Creating the fasta sequence dictionary file java -jar CreatSequenceDictionary.jar R=sequencename.fasta O=sequencename.dict 2,Creating the fasta index file samtools faidx sequencename.fasta

Annovar注释说明【转载自http://blog.csdn.net/u013816205/article/details/51262289】

ANNOVAR是一个perl编写的命令行工具,能在安装了perl解释器的多种操作系统上 执行.允许多种输入文件格式,包括最常被使用的VCF格式.输出文件也有多种格式,包括注释过的VCF文件.用tab或者逗号分隔的text文件. ANNOVAR能快速注释遗传变异并预测其功能.类似的variants注释软件还有 VEP, snpEff, VAAST, AnnTools等等. ANNOVAR支持三种不同形式的注释: gene-based, region-based 和filter-based. 这三种

annovar对人类基因组和非人类基因组variants注释流程

部分翻译:Hui Y, Kai W. Genomic variant annotation and prioritization with ANNOVAR and wANNOVAR[J]. Nature Protocols, 2015, 10(10). 此文只是用于作者和所有初接触annovar软件者分享交流.更深入学习请仔细阅读全文.转载请注明. ANNOVAR是一个perl编写的命令行工具,能在安装了perl解释器得多种操作系统上执行.允许多种输入文件格式,包括最常被使用的VCF格式.输出文

NCBI News

NCBI淘汰序列GI - 使用Accession.Version代替! 截至2016年9月,被称为"GI"的整数序列标识符将不再包括在NCBI支持的序列记录的GenBank,GenPept和FASTA格式中.FASTA标题将进一步简化,以便仅报告国际序列数据库协作(INSDC)和NCBI参考序列(RefSeq)项目管理的登录的序列登录号和记录标题. 当NCBI进行此转换时,我们鼓励具有依赖于GI的工作流的任何用户开始计划使用accession.version标识符.2016年9月之后,

snap

1.snap的下载与安装 snap的说明文档: /home/share/biosoft/snap/00README 下载: wget http://korflab.ucdavis.edu/Software/snap-2013-11-29.tar.gz 文件说明: DNA Contains some sample sequences HMM Contains SNAP parameter files LICENSE The GNU General Public License Makefile F

35、多重比对序列的格式及其应用

转载:http://boyun.sh.cn/bio/?p=1711 这里对多重序列比对格式(Multiple sequence alignment – MSA)进行总结.在做系统演化分析.序列功能分析.基因预测等,都需要涉及到多重序列比对.特别是当需要用不同软件对多重比对序列进行批量操作时,会遇到各种的格式,而这些格式是如何产生的,有什么区别,格式之间如何转换,从哪里可以下载到相关的格式序列,不同的格式又有什么特殊的用途等,本篇文章将就这些问题进行总结与讨论.因为涉及内容较多,不足之处,欢迎大家

MSA:多重比对序列的格式及其应用

多重比对序列的格式及其应用 这里对多重序列比对格式(Multiple sequence alignment – MSA)进行总结.在做系统演化分析.序列功能分析.基因预测等,都需要涉及到多重序列比对.特别是当需要用不同软件对多重比对序列进行批量操作时,会遇到各种的格式,而这些格式是如何产生的,有什么区别,格式之间如何转换,从哪里可以下载到相关的格式序列,不同的格式又有什么特殊的用途等,本篇文章将就这些问题进行总结与讨论.因为涉及内容较多,不足之处,欢迎大家补充或者批判. 生物信息学的基础是基于这

GFF3格式文件

GFF3是GFF注释文件的新标准.文件中每一行为基因组的一个属性,分为9列,以TAB分开. 依次是: 1. reference sequence:参照序列 指出注释的对象.如一个染色体,克隆或片段.可以有多个参照序列. 该id的取名不能以’>’开头,不能包含空格. 2. source :来源 注释的来源.如果未知,则用点(.)代替. 3. type      :类型 属性的类型.建议使用符合SO惯例的名称(sequence ontology,参看[[Sequence Ontology Proje