如果某fq_clean文件的其中一端出现了错误,我们手里还持有他的原始数据,那我们就可以用以下方法处理
1,首先提取clean文件另一端的id,我用了python脚本
import gzip
import argparse
def extract_gene_names(input_fastq_gz, output_file):
with gzip.open(input_fastq_gz, 'rt') as fastq_gz, open(output_file, 'w') as out_file:
line_count = 0
for line in fastq_gz:
line_count += 1
if line_count % 4 == 1:
gene_name = line.strip().split()[0]
out_file.write(gene_name + '\n')
def main():
parser = argparse.ArgumentParser(description='Extract gene names from a gzipped fastq file.')
parser.add_argument('input_fastq_gz', help='Path to the input fastq.gz file')
parser.add_argument('output_file', help='Path to the output file containing gene names')
args = parser.parse_args()
extract_gene_names(args.input_fastq_gz, args.output_file)
if __name__ == '__main__':
main()
用法如下
usage: get_id_fqgz.py [-h] input_fastq_gz output_file
2,使用perl脚本根据这个id文件和原始数据生成所需的cleandata
use Bio::SeqIO;
use Bio::Seq;
use Term::ProgressBar;
if (@ARGV == 3) {
my ($id_file, $fq1_file, $out1_file) = @ARGV;
filter_fastq($id_file, $fq1_file, $out1_file);
} elsif (@ARGV == 5) {
my ($id_file, $fq1_file, $fq2_file, $out1_file, $out2_file) = @ARGV;
filter_fastq($id_file, $fq1_file, $out1_file, $fq2_file, $out2_file);
} else {
die "Usage: perl $0 <id> <fq1> [<fq2>] <OUT1> [<OUT2>]\n";
}
sub filter_fastq {
my ($id_file, $fq1_file, $out1_file, $fq2_file, $out2_file) = @_;
my %keep;
open my $ID, $id_file or die "Cannot open ID file: $id_file\n";
while (<$ID>) {
chomp;
next if /^#/;
my @tmp = split(/\s+/);
my ($cleaned_id) = $tmp[0] =~ /([^@]+)/; # 移除 "@" 符号
$keep{$cleaned_id} = 1;
}
close($ID);
process_fastq_file($fq1_file, $out1_file, \%keep, "Read 1");
if (defined $fq2_file && defined $out2_file) {
process_fastq_file($fq2_file, $out2_file, \%keep, "Read 2");
}
}
sub process_fastq_file {
my ($input_file, $output_file, $keep_ref, $read_label) = @_;
open my $FQ, "zcat $input_file |" or die "Cannot open input file: $input_file\n";
open my $GZ, "| gzip >$output_file" or die "Cannot open output file: $output_file\n";
my $fq_in = Bio::SeqIO->new(-fh => $FQ, -format => 'fastq');
my $fq_out = Bio::SeqIO->new(-fh => $GZ, -format => 'fastq');
my $total_seqs = `zcat $input_file | grep -c "^@"`; # Count total sequences in the input file
chomp($total_seqs);
my $progress = Term::ProgressBar->new({name => $read_label, count => $total_seqs, ETA => 'linear'});
$progress->minor(0); # Disabling minor updates for better performance
my $count = 0;
while (my $seq_obj = $fq_in->next_seq()) {
my $id = $seq_obj->id;
my ($cleaned_id) = $id =~ /([^@]+)/; # 移除 "@" 符号
if (exists $keep_ref->{$cleaned_id}) {
$fq_out->write_seq($seq_obj);
}
$count++;
$progress->update($count);
}
$progress->update($total_seqs); # Ensure the progress bar reaches 100%
close($FQ);
close($GZ);
}
没有开头那几个模块请自行百度安装,用法如下,【】中的是可选参数
Usage: perl raw_to_clean_byid.pl <id> <fq1> [<fq2>] <OUT1> [<OUT2>]
第二步的脚本优化自https://www.omicsclass.com/article/1267
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!