10 老师您好,我想利用gff文件和fa文件提取cds序列,请问应考虑哪些问题呢

我自己写的脚本总是提取不对,正链的每个mrna的第一个cds是对的,后面就不是atg开头了,负链就完全不对了,已经把负链互补过了,下面是我写的脚本,您能帮忙看看哪里出错了吗?attachments-2019-10-BK5RcqOn5d9c86a7ad2fe.pngattachments-2019-10-wNSvQUB85d9c86b61dfe3.pngattachments-2019-10-eIFeEglz5d9c86c235627.png

请先 登录 后评论

1 个回答

omicsgene - 生物信息
擅长:重测序,遗传进化,转录组,GWAS

fasta序列处理截取反向互补,建议用bioperl:

一般基因组都会提供基因的CDS序列,很少自己去截取。

代码参考如下:parse_gff  和 get_bed_cds 这个两个方法,你看看吧;

建议学习perl高级里面有讲解bioperl:perl入门到精通perl语言高级

#!/usr/bin/perl -w
use strict;
use Cwd qw(abs_path getcwd);
use Getopt::Long;
use Data::Dumper;
use FindBin qw($Bin $Script);
use File::Basename qw(basename dirname);
use Config::General;
use File::Path qw(make_path remove_tree);
use File::Copy;
use File::Spec;
my $BEGIN_TIME=time();
my $version="1.0";
use PerlIO::gzip;
use Bio::SeqIO;
my ($ref,$od,$query,$minL,$ref_label,$query_label,$prefix,$ref_gff,$query_gff,$cscore,$minspan,$minsize);
GetOptions(
                "help|?" =>\&USAGE,
"ref=s"=>\$ref,
"query=s"=>\$query,
"ref_gff=s"=>\$ref_gff,
"query_gff=s"=>\$query_gff,
"ref_label=s"=>\$ref_label,
"query_label=s"=>\$query_label,
"prefix=s"=>\$prefix,
"minL=s"=>\$minL,
"cscore=s"=>\$cscore,
"minspan=s"=>\$minspan,
"minsize=s"=>\$minsize,
"od=s"=>\$od,
                ) or &USAGE;
&USAGE unless ($ref and $query);
sub USAGE {
my $usage=<<"USAGE";
Program: $0
Discription:
usage:

Options:
-ref    Set the input reference multi-FASTA filename  required
-query    Set the input query multi-FASTA filename required
-ref_gff    Set the input reference gff  file required
-query_gff    Set the input query gff file required
-od         the outdir 
-ref_label  label ref genome name
-query_label  label query genome name
-prefix mcscan 
-minL    min chr length display,default 1000000
-cscore  C-score cutoff [default: 0.7]
-minspan  remove blocks with less span than this default 30.
-minsize  remove blocks with less number of anchors than this.default 0;
If you look closely, one of the 3 regions are often stronger, which corresponds to the orthologous 
regions between the two genomes. What if we just want to get the 1:1 orthologous region? 
We'll just repeat what we did but adding an option --cscore=.99. 
C-score is defined by the ratio of LAST hit to the best BLAST hits to either the query and hit. 
A C-score cutoff of .99 effectively filters the LAST hit to contain reciprocal best hit (RBH).
USAGE
print $usage;
exit 1;
}
$od||=getcwd;
$minL||=1000000;
$cscore||=0.7;
$prefix||="mcscan";
mkdir $od unless(-d $od);
$od=abs_path($od);
my@Color=qw(
#8dd3c7
#ffffb3
#bebada
#fb8072
#80b1d3
#fdb462
#b3de69
#fccde5
#d9d9d9
#bc80bd
#ccebc5
#ffed6f
#a6cee3
#1f78b4
#b2df8a
#33a02c
#fb9a99
#e31a1c
#fdbf6f
#ff7f00
#cab2d6
#6a3d9a
#ffff99
#b15928 )x100;
#my$work_sh="$od/work_sh";
#mkdir $work_sh unless(-d $work_sh);
$ref=abs_path($ref);
$query=abs_path($query);
$ref_gff=abs_path($ref_gff);
$query_gff=abs_path($query_gff);
$query_label||="Query";
$ref_label||="Ref";
$minspan||=30;
$minsize||=0;
my%ref_chr=();
my%query_chr=();
my%gene_chr_ref=();
my%gene_chr_query=();
my%chrColor=();
my%ref_gff=&parse_gff($ref_gff,\%ref_chr,\%gene_chr_ref);
my%query_gff=&parse_gff($query_gff,\%query_chr,\%gene_chr_query);
#Sprint Dumper(\%query_gff);
&get_bed_cds(\%ref_gff,$ref,"$od/$ref_label.cds","$od/$ref_label.bed");
#print Dumper(\%ref_gff);
#die;
&get_bed_cds(\%query_gff,$query,"$od/$query_label.cds","$od/$query_label.bed");


#####################################################################
sub parse_gff(){
my$gff=shift;
my$chr=shift;
my$gene_chr=shift;
my%gene=();
my%mRNA2Gene=();
open IN,"$gff" or die "$!";
while(<IN>){
chomp;
next if (/^#/);
my@tmp=split(/\t/);

if(exists $chr->{$tmp[0]}){
if($tmp[4]>$chr->{$tmp[0]}){
$chr->{$tmp[0]}=$tmp[4];
}
}else{
$chr->{$tmp[0]}=$tmp[4];
}

if($tmp[2] =~/^gene/){
#print "$tmp[2]\n";
if ($tmp[8]=~/biotype=([^;]+)/){
my($biot)=($tmp[8]=~/biotype=([^;]+)/);
next if($biot!~/protein/i);
}


my($id)=($tmp[8]=~/ID=([^;]+)/);
$gene{$id}{region}=[$tmp[0],$tmp[3],$tmp[4],$tmp[6]];
#print "gene:$id\n";
$gene_chr->{$id}=$tmp[0];
}
if($tmp[2] =~/mRNA|transcript/i){
my($id)=($tmp[8]=~/ID=([^;]+)/);
my($pid)=($tmp[8]=~/Parent=([^;]+)/);
if (! defined($pid)){
$pid=$id;
$gene{$id}{region}=[$tmp[0],$tmp[3],$tmp[4],$tmp[6]];
}

if(exists $gene{$pid}){
$gene{$pid}{$id}{region}=[$tmp[0],$tmp[3],$tmp[4],$tmp[6]];
$mRNA2Gene{$id}=$pid;
}

#print "mRNA:$id\n";
}
if($tmp[2] =~/utr/i){
my($pid)=($tmp[8]=~/Parent=([^;]+)/);

push @{$gene{$mRNA2Gene{$pid}}{$pid}{utr}},[$tmp[0],$tmp[3],$tmp[4],$tmp[6]] if (exists $mRNA2Gene{$pid} and exists $gene{$mRNA2Gene{$pid}});
}
if($tmp[2] =~/CDS/i){
my($pid)=($tmp[8]=~/Parent=([^;]+)/);
if  (exists $mRNA2Gene{$pid} and exists $gene{$mRNA2Gene{$pid}}){
push @{$gene{$mRNA2Gene{$pid}}{$pid}{cds}},[$tmp[0],$tmp[3],$tmp[4],$tmp[6]];
}elsif(exists $gene{$pid}){
push @{$gene{$pid}{notrans}{cds}},[$tmp[0],$tmp[3],$tmp[4],$tmp[6]];
}





}
#if($tmp[2] =~/exon/i){
#my($pid)=($tmp[8]=~/Parent=([^;]+)/);
#
#push @{$gene{$mRNA2Gene{$pid}}{$pid}{exon}},[$tmp[0],$tmp[3],$tmp[4],$tmp[6]] if (exists $mRNA2Gene{$pid} and exists $gene{$mRNA2Gene{$pid}});
#}
}
#print Dumper(\%gene);
return %gene;

}

sub get_bed_cds(){
my($gff,$ref_fa,$cds_out,$bed_out)=@_;
my%ref_fa=();
my$in  = Bio::SeqIO->new(-file => "$ref_fa" ,
                               -format => 'Fasta');
     while ( my $seq = $in->next_seq() ) {
            $ref_fa{$seq->id}=$seq;
        }
$in->close();

my$out  = Bio::SeqIO->new(-file => ">$cds_out" ,
                               -format => 'Fasta');
    open OUT ,">$bed_out" or die "$!";
    
    for my$geneID(keys %{$gff}){
    print OUT "$gff->{$geneID}->{region}->[0]\t$gff->{$geneID}->{region}->[1]\t$gff->{$geneID}->{region}->[2]\t$geneID\t0\t$gff->{$geneID}->{region}->[3]\n";
    my$strand=$gff->{$geneID}->{region}->[3];
    my$max_seq="";
    my$max_id="";
    my$seq="";
    for my$rnaID(keys %{$gff->{$geneID}}){
    next if $rnaID eq "region";
   
    #print "$geneID $rnaID\n";
    #print Dumper($gff->{$geneID}->{$rnaID});
    #die;
    my@cds;
    if(exists $gff->{$geneID}->{$rnaID}->{cds}){
    @cds=@{$gff->{$geneID}->{$rnaID}->{cds}};
    }else{
    print "gene not exists cds :$geneID\n";
    next;
    }
   
    #print "@cds\n";
    @cds=sort{$a->[1]<=>$b->[1]} @cds;
   
   
    for my $cds(@cds){
   
    if(exists $ref_fa{$cds->[0]}){
    $seq.=$ref_fa{$cds->[0]}->subseq($cds->[1],$cds->[2]);
    }else{
    print "not exists chr: $cds->[0] \n "; 
    }
   
    }
    if (length($seq)>length($max_seq)){
    $max_seq=$seq;
    $max_id=$rnaID;
    }
    }
   
    my$out_seq=Bio::Seq->new(-id=>"$geneID",-seq=>$max_seq,-desc=>$max_id);
   
    if ($strand eq '-'){
    $out->write_seq($out_seq->revcom);
    }else{
    $out->write_seq($out_seq);
    }
   
   
   
    }
    
close(OUT);
$out->close();

}






请先 登录 后评论

相似问题