你不知道c语言写的程序要多块——以NGS fastq文件reads计数为例
时间:2022-07-23
本文章向大家介绍你不知道c语言写的程序要多块——以NGS fastq文件reads计数为例,主要内容包括其使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。
本人 以fastq.gz文件计数为例分别以perl语言和c语言实现了代码,具体如下:
#!/usr/bin/perl -w
#
# Copyright (c) xuxiong 2020
# Writer: xuxiong <xuxiong19880610@163.com><xuxiong@me.com>
# Program Date: 2020.01.03
# Modifier: xuxiong <xuxiong19880610@163.com><xuxiong@me.cn>
# Last Modified: 2020.01.03
my $ver="0.0.1";
use strict;
use Getopt::Long;
use Data::Dumper;
use FindBin qw($Bin $Script);
use FileHandle;
use File::Basename qw(basename dirname);
use List::Util qw(first max maxstr min minstr reduce shuffle sum uniq);
use Cwd qw(abs_path getcwd realpath);
use Time::HiRes qw(time);
my $infile;
GetOptions(
"help|?" =>&USAGE,
"i:s"=>$infile
) or &USAGE;
&USAGE unless ($infile ) ;
###############Time_start#############################
my $Time_Start;
$Time_Start = sub_format_datetime(localtime(time()));
print STDERR "nStart Time :[$Time_Start]nn";
my $BEGIN_TIME=time();
######################################################
my $read_count=0;
load_fastq($infile,$read_count);
print STDERR "read count: ",$read_count,"n";
sub load_fastq{
my ($infile,$n)=@_;
open(IN,$infile=~/.gz$/ ? "gzip -cd $infile |" : $infile) || die "Can't open $infilen";
my $read_num=0;
while (<IN>) {
last if(eof(IN));
$$n++;
<IN>;
<IN>;
<IN>;
}
close IN;
}
###############Time_end###########
my $Time_End;
$Time_End = sub_format_datetime(localtime(time()));
print STDERR "nEnd Time :[$Time_End]nn";
print STDERR "Time elapsed :",time()-$BEGIN_TIME," sn";
###############Sub_format_datetime
sub sub_format_datetime {#Time calculation subroutine
my($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = @_;
$wday = $yday = $isdst = 0;
sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $day, $hour, $min, $sec);
}
sub USAGE {#
my $usage=<<"USAGE";
Program: $0
Version: $ver
Contact: XiongXu <xuxiong@me.com> <xuxiong19880610@163.com>
Example:
perl $0
Description:
This program is used for counting load_fastq file
Usage:
-i infile required
USAGE
print STDERR $usage;
exit;
}
// gcc -g -O3 -Wall fastq_count_nomal.c -o fastq_count_nomal -lz
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <sys/time.h>
#include <time.h>
#include <zlib.h>
#include <fcntl.h>
//#include <unistd.h>
struct globalArgs_t {
const char *infile; /* -i option */
int verbosity; /* -v option */
} globalArgs;
void load_file(gzFile fp,unsigned long *Line_num);
static inline int readNextNode(gzFile fq,char *buf[4]) ;
void display_usage(char * argv[]);
static inline long long usec(void);
static gzFile open_input_stream(const char *filename);
long long usec(void){
struct timeval tv;
gettimeofday(&tv,NULL);
return (((long long)tv.tv_sec)*1000000)+tv.tv_usec;
}
void display_usage(char * argv[]){
char *buffer=(char* )malloc(10240*sizeof(char));
const char* usage=
"nCopyright (c) 2020n"
"Contact: XiongXu <xuxiong@19880610@163.cn> <xiongxu@me.com> n"
"Discription:n This program is used for cutting the reads to get the specified cycles.n"
"Usage: %s [-i Infile] [-o OUTFILE] [-s start] [-e end] [-h] n"
"Example1:n %s -i /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz -s 0 -e 100 -o out n"
"Example2:n zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 100 -o sss n"
"Example3:n zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 50 |less -S n"
"n"
" [-i Infile] = Infile.default is stdin [required]n"
" [-v ] = verbose mod [option]n"
" [-h] This helpful help screen. [option]n"
"n";
sprintf(buffer,usage,argv[0],argv[0],argv[0],argv[0]);
fprintf(stderr,"%s",buffer);
free(buffer);
exit(1);
}
gzFile open_input_stream(const char *filename){
int fd ;
if (strncmp(filename,"-", 1)==0 || !strcmp(filename,"")) {
fd = STDIN_FILENO;
} else {
#if defined(__APPLE__) && defined(__MACH__) || defined(unix) || defined(linux)
fd = open(filename, O_CREAT | O_RDONLY, 0666 );
#else
fd = _open(filename, _O_CREAT|_O_BINARY | _O_RDONLY , 0666 );
#endif
if (fd==-1) fprintf(stderr, "Failed to create input file (%s)", filename);
}
gzFile in = gzdopen(fd,"rb");
return in;
}
int readNextNode(gzFile fq,char *buf[4]) {
int i=1;
buf[0]=gzgets(fq,buf[0],1024*sizeof(char));
if (!gzeof(fq)) {
buf[1]=gzgets(fq,buf[1],1024*sizeof(char));
buf[2]=gzgets(fq,buf[2],1024*sizeof(char));
buf[3]=gzgets(fq,buf[3],1024*sizeof(char));
if (globalArgs.verbosity) fprintf(stdout,"%s%s%s%s",buf[0],buf[1],buf[2],buf[3]);
} else {
return 0;
}
return i;
}
void load_file(gzFile fp,unsigned long *Line_num) {
long long begin,end;
begin=usec();
char *buf[4];
int i=0;
for(i=0;i<4;++i) {
buf[i] = (char *)calloc(1024,sizeof(char));
}
while (1) {
int status=readNextNode(fp,buf);
if (!status) break;
(*Line_num)++;
}
end=usec();
fprintf(stderr,"Total_reads: %lunFinished in %.3f sn",*Line_num,(double)(end-begin)/CLOCKS_PER_SEC);
for(i=0;i<4;++i) {
free(buf[i]);
}
}
int main(int argc, char *argv[])
{
int opt = 0;
globalArgs.infile="-";
globalArgs.verbosity=0;
const char *optString = "i:vh?";
if (argc<2){
display_usage(argv);
exit(1);
}
opt = getopt( argc, argv, optString );
while( opt != -1 ) {
switch( opt ) {
case 'i':
globalArgs.infile = optarg;
break;
case 'v':
globalArgs.verbosity++;
break;
case '?': /* fall-through is intentional */
case 'h':
display_usage(argv);
break;
default:
fprintf(stderr,"error parameter!n");
/* You won't actually get here. */
break;
}
opt = getopt( argc, argv, optString );
}
gzFile in=open_input_stream(globalArgs.infile);
unsigned long reads_num=0;
load_file(in,&reads_num);
gzclose(in);
return 0;
}
运行效果如下:
perl fastq_count.pl -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz
Start Time :[2020-01-14 12:30:10]
read count: 47861176
End Time :[2020-01-14 12:31:59]
Time elapsed :109.049475908279 s
./fastq_count_normal -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz
Total_reads: 47861176
Finished in 66.111 s
可见c写的比perl的足足快了43 s,要知道Illumina/BGI的测序仪目前下机的格式其实都是BGZF(一种兼容gzip的可并行压缩/解压格式),本人调用htslib中的bgzf,用c语言实现的工具采用6个线程运行的效果如下:
./fastq_count_bgzip -t 6 -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz
Total_reads: 47861176
Finished in 14.588 s
- 爬取 Stackoverflow 100 万条问答并简单分析
- Python: 早点知道这些就不会这样了
- 第一个参数:initScans(job)
- Mysql 高可用 InnoDB Cluster 多节点搭建过程
- 用MapReduce分析Hbase将结果插入mysql中
- 敲敲级简单的鉴别H图片的小程序
- processArgs方法解读
- Python分布式微博爬虫(源码分享)
- NewInstallUserMapper.class阅读笔记
- TimeUtil类所有方法
- mapreduce项目调优
- zookeeper强制关机无法启动Unable to load database on disk具体解决步骤
- Python爬淘宝——300W淘宝文胸说明了什么
- 文本编码转换工具iconv 附批量转换文件编码命令
- JavaScript 教程
- JavaScript 编辑工具
- JavaScript 与HTML
- JavaScript 与Java
- JavaScript 数据结构
- JavaScript 基本数据类型
- JavaScript 特殊数据类型
- JavaScript 运算符
- JavaScript typeof 运算符
- JavaScript 表达式
- JavaScript 类型转换
- JavaScript 基本语法
- JavaScript 注释
- Javascript 基本处理流程
- Javascript 选择结构
- Javascript if 语句
- Javascript if 语句的嵌套
- Javascript switch 语句
- Javascript 循环结构
- Javascript 循环结构实例
- Javascript 跳转语句
- Javascript 控制语句总结
- Javascript 函数介绍
- Javascript 函数的定义
- Javascript 函数调用
- Javascript 几种特殊的函数
- JavaScript 内置函数简介
- Javascript eval() 函数
- Javascript isFinite() 函数
- Javascript isNaN() 函数
- parseInt() 与 parseFloat()
- escape() 与 unescape()
- Javascript 字符串介绍
- Javascript length属性
- javascript 字符串函数
- Javascript 日期对象简介
- Javascript 日期对象用途
- Date 对象属性和方法
- Javascript 数组是什么
- Javascript 创建数组
- Javascript 数组赋值与取值
- Javascript 数组属性和方法
- ES对磁盘的要求都有哪些,大部分你可能不知道
- docker通过模板创建镜像以及容器、仓库和数据管理
- Form表单类组件与Map地图组件
- 转录组分析 | 使用Hisat2进行序列比对
- 最后一个页面:构建电影详情页面
- 安装docker以及通过容器创建镜像
- 转录组分析 | 使用trim-galore去除低质量的reads和adaptor
- 设计模式之工厂方法模式
- Ubuntu上安装TensorFlow(python2.7版)
- 转录组分析 | fastqc进行质控与结果解读
- playbook管理配置文件
- 使用playbook安装nginx
- 第五个页面:更多电影页面
- 云原生|Operator本质
- 简单使用ansible-playbook