vlambda博客
学习文章列表

你不知道c语言写的程序要多块——以NGS fastq文件reads计数为例

本人 以fastq.gz文件计数为例分别以perl语言和c语言实现了代码,具体如下:

#!/usr/bin/perl -w
#
# Copyright (c) xuxiong 2020
# Writer: xuxiong <[email protected]><xuxiong\@me.com>
# Program Date: 2020.01.03
# Modifier: xuxiong <[email protected]><xuxiong\@me.cn>
# Last Modified: 2020.01.03
my $ver="0.0.1";

use strict;
use Getopt::Long;
use Data::Dumper;
use FindBin qw($Bin $Script);
use FileHandle;
use File::Basename qw(basename dirname);
use List::Util qw(first max maxstr min minstr reduce shuffle sum uniq);
use Cwd qw(abs_path getcwd realpath);
use Time::HiRes qw(time);

my $infile;
GetOptions(
"help|?" =>\&USAGE,
"i:s"=>\$infile
) or &USAGE;
&USAGE unless ($infile ) ;
###############Time_start#############################
my $Time_Start;
$Time_Start = sub_format_datetime(localtime(time()));
print STDERR "\nStart Time :[$Time_Start]\n\n";
my $BEGIN_TIME=time();
######################################################
my $read_count=0;
load_fastq($infile,\$read_count);
print STDERR "read count: ",$read_count,"\n";
sub load_fastq{
my ($infile,$n)=@_;
open(IN,$infile=~/\.gz$/ ? "gzip -cd $infile |" : $infile) || die "Can't open $infile\n";
my $read_num=0;
while (<IN>) {
last if(eof(IN));
$$n++;
<IN>;
<IN>;
<IN>;
}
close IN;
}

###############Time_end###########
my $Time_End;
$Time_End = sub_format_datetime(localtime(time()));
print STDERR "\nEnd Time :[$Time_End]\n\n";
print STDERR "Time elapsed :",time()-$BEGIN_TIME," s\n";
###############Sub_format_datetime
sub sub_format_datetime {#Time calculation subroutine
my($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = @_;
$wday = $yday = $isdst = 0;
sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $day, $hour, $min, $sec);
}

sub USAGE {#
my $usage=<<"USAGE";
Program: $0
Version: $ver
Contact: XiongXu <xuxiong\@me.com> <xuxiong19880610\@163.com>

Example:
perl $0
Description:
This program is used for counting load_fastq file

Usage:
-i infile required

USAGE
print STDERR $usage;
exit;
}
// gcc -g -O3 -Wall fastq_count_nomal.c -o fastq_count_nomal -lz
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <sys/time.h>
#include <time.h>
#include <zlib.h>
#include <fcntl.h>
//#include <unistd.h>

struct globalArgs_t {
const char *infile; /* -i option */
int verbosity; /* -v option */
} globalArgs;

void load_file(gzFile fp,unsigned long *Line_num);
static inline int readNextNode(gzFile fq,char *buf[4]) ;
void display_usage(char * argv[]);
static inline long long usec(void);
static gzFile open_input_stream(const char *filename);

long long usec(void){
struct timeval tv;
gettimeofday(&tv,NULL);
return (((long long)tv.tv_sec)*1000000)+tv.tv_usec;
}

void display_usage(char * argv[]){
char *buffer=(char* )malloc(10240*sizeof(char));
const char* usage=
"\nCopyright (c) 2020\n" \
"Contact: XiongXu <xuxiong@[email protected]> <[email protected]> \n" \
"Discription:\n This program is used for cutting the reads to get the specified cycles.\n" \
"Usage: %s [-i Infile] [-o OUTFILE] [-s start] [-e end] [-h] \n" \
"Example1:\n %s -i /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz -s 0 -e 100 -o out \n" \
"Example2:\n zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 100 -o sss \n" \
"Example3:\n zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 50 |less -S \n" \
"\n" \
" [-i Infile] = Infile.default is stdin [required]\n" \
" [-v ] = verbose mod [option]\n" \
" [-h] This helpful help screen. [option]\n" \
"\n";
sprintf(buffer,usage,argv[0],argv[0],argv[0],argv[0]);
fprintf(stderr,"%s",buffer);
free(buffer);
exit(1);
}

gzFile open_input_stream(const char *filename){
int fd ;
if (strncmp(filename,"-", 1)==0 || !strcmp(filename,"")) {
fd = STDIN_FILENO;
} else {
#if defined(__APPLE__) && defined(__MACH__) || defined(unix) || defined(linux)
fd = open(filename, O_CREAT | O_RDONLY, 0666 );
#else
fd = _open(filename, _O_CREAT|_O_BINARY | _O_RDONLY , 0666 );
#endif
if (fd==-1) fprintf(stderr, "Failed to create input file (%s)", filename);
}
gzFile in = gzdopen(fd,"rb");
return in;
}

int readNextNode(gzFile fq,char *buf[4]) {
int i=1;
buf[0]=gzgets(fq,buf[0],1024*sizeof(char));
if (!gzeof(fq)) {
buf[1]=gzgets(fq,buf[1],1024*sizeof(char));
buf[2]=gzgets(fq,buf[2],1024*sizeof(char));
buf[3]=gzgets(fq,buf[3],1024*sizeof(char));
if (globalArgs.verbosity) fprintf(stdout,"%s%s%s%s",buf[0],buf[1],buf[2],buf[3]);
} else {
return 0;
}
return i;
}

void load_file(gzFile fp,unsigned long *Line_num) {
long long begin,end;
begin=usec();
char *buf[4];
int i=0;
for(i=0;i<4;++i) {
buf[i] = (char *)calloc(1024,sizeof(char));
}
while (1) {
int status=readNextNode(fp,buf);
if (!status) break;
(*Line_num)++;
}
end=usec();
fprintf(stderr,"Total_reads: %lu\nFinished in %.3f s\n",*Line_num,(double)(end-begin)/CLOCKS_PER_SEC);
for(i=0;i<4;++i) {
free(buf[i]);
}
}

int main(int argc, char *argv[])
{
int opt = 0;
globalArgs.infile="-";
globalArgs.verbosity=0;
const char *optString = "i:vh?";
if (argc<2){
display_usage(argv);
exit(1);
}
opt = getopt( argc, argv, optString );
while( opt != -1 ) {
switch( opt ) {
case 'i':
globalArgs.infile = optarg;
break;
case 'v':
globalArgs.verbosity++;
break;
case '?': /* fall-through is intentional */
case 'h':
display_usage(argv);
break;
default:
fprintf(stderr,"error parameter!\n");
/* You won't actually get here. */
break;
}
opt = getopt( argc, argv, optString );
}
gzFile in=open_input_stream(globalArgs.infile);
unsigned long reads_num=0;
load_file(in,&reads_num);
gzclose(in);
return 0;
}

运行效果如下:

perl fastq_count.pl -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz

Start Time :[2020-01-14 12:30:10]

read count: 47861176

End Time :[2020-01-14 12:31:59]

Time elapsed :109.049475908279 s
./fastq_count_normal -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz 
Total_reads: 47861176
Finished in 66.111 s

可见c写的比perl的足足快了43 s,要知道Illumina/BGI的测序仪目前下机的格式其实都是BGZF(一种兼容gzip的可并行压缩/解压格式),本人调用htslib中的bgzf,用c语言实现的工具采用6个线程运行的效果如下:

./fastq_count_bgzip -t 6 -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz 
Total_reads: 47861176
Finished in 14.588 s

欢迎参与讨论,转发3个以上专业群可获取fastq_count_bgzip.c 源代码