今回は、RNA-seqデータ解析(対話モード版)で行なった処理を毎回対話モードで行うのは面倒なので、バッチスクリプトを作成して一括で行います。
以下のパラメタファイル(params.json
)を読み込んで各種操作を行います。
{
"qsub_q": "u-debug",
"n_node": 1,
"n_cpus": 1,
"n_MPIprocs": 1,
"n_OMPthreads": 8,
"walltime": "0:30:00",
"group_list": "gj29",
"ID": "ERR315326",
"RNAseqData": "ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra/ERX/ERX288/ERX288491/ERR315326/ERR315326.sra",
"MappingRefGenome": "https://cloud.biohpc.swmed.edu/index.php/s/grch38/download",
"GeneAnnotation": "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz"
}
各変数は、 jq
コマンドを使って以下のように読み込みます。
[変数名]=$(cat [パラメタファイルへのパス] | jq -r .[key名])
[変数名]=`jq -r .[key名] [パラメタファイルへのパス]`
成果物
make_batch_u_qsub.sh
という以下のシェルスクリプトを叩くことで、params.json
の内容を踏まえたバッチスクリプト batch_u_qsub.sh
を作成します。
make_batch_u_qsub.sh
[download]
#!/bin/bash
Ssymbol="["
Esymbol="]"
function AssignParams () {
<< COMMENTOUT
@params params.json {"qsub_q": "u-debug"}
@params templates.sh PBS -q [${n_node}]
@return batch_u_qsub.sh PBS -q u-debug
COMMENTOUT
line="${1}"
while true
do
Spos=`echo "${line}" | awk -v pattern=${Ssymbol} '{print index($0, pattern)}'`
Epos=`echo "${line}" | awk -v pattern=${Esymbol} '{print index($0, pattern)}'`
if [ ${Spos} -ne 0 -a ${Epos} -ne 0 -a ${Spos} -lt ${Epos} ]; then
PREFIX=`echo "${line:0:$(($Spos-1))}"`
VARIABLE=`echo "${line}" | cut -c "$(($Spos+1))-$(($Epos-1))"`
CONTENT=`eval echo ${VARIABLE}`
SUFFIX=`echo "${line:$Epos}"`
line="${PREFIX}${CONTENT}${SUFFIX}"
else
break
fi
done
echo "${line}" >> $OUTPUT_FILE
}
echo -n "Path to params file (.json): "
read PARAMS_FILE
# Convert each element of json to a variable.
KEYS=`jq -r 'keys[]' $PARAMS_FILE`
for key in $KEYS; do
eval $key=`jq -r .${key} $PARAMS_FILE`
done
OUTPUT_FILE="${ID}/batch_u_qsub.sh"
if [ ! -d ${ID} ]; then
mkdir ${ID}
fi
echo -n "Path to templates file (.sh): "
read TEMPLATES_FILE
PRE_IFS=$IFS
IFS=$'\n'
for line in `cat ${TEMPLATES_FILE}`
do
AssignParams ${line}
done
IFS=$PRE_IFS
templates.sh
[download]
#!/bin/sh
#PBS -q [${n_node}]
#PBS -l select=[${n_node}]:ncpus=[${n_cpus}]:mpiprocs=[${n_MPIprocs}]:ompthreads=[${n_OMPthreads}]
#PBS -l walltime=[${walltime}]
#PBS -W group_list=[${group_list}]
DB_DIR="db"
RNA_DIR="RNAseq"
function DecompressHandler() {
: '
@params ${1} Extension 1
@params ${2} Extension 2
@params ${3} File Name (hoge.${1}.${2})
'
n_compressed_fn=${#3}
if [ ${2} = "zip" ]; then
unzip ${3}
n_extension=4
elif [ ${2} = "tar" ]; then
tar ${3}
n_extension=4
elif [ ${2} = "gz" ]; then
gunzip ${3}
n_extension=3
elif [ ${2} = "bz2" ]; then
bzip2 -d ${3}
n_extension=4
elif [ ${2} = "lha" -o ${2} = "lzh"]; then
lha x ${3}
n_extension=4
elif [ ${1} = "tar" ]; then
tar ${3}
n_extension=$((${#2}+5))
else
n_extension=0
fi
de_compressed_fn=${3:0:$(($n_compressed_fn-$n_extension))}
mv $de_compressed_fn "../${DB_DIR}/"
if [ ${n_extension} -ne 0 ]; then
rm ${3}
fi
echo "${de_compressed_fn}"
}
# Download, Decompress, Dispose
function D3() {
: '
@params ${1} URL
'
fn=`wget -nv --content-disposition $1 2>&1 |cut -d\" -f2`
extensions=( `echo $fn | tr -s '.' ' '`)
n_extensions=${#extensions[@]}
ext1=${extensions[$(($n_extensions-2))]}
ext2=${extensions[$(($n_extensions-1))]}
DecompressHandler $ext1 $ext2 $fn
}
#=== START ===
cd $PBS_O_WORKDIR/$RNA_DIR}
if [ ! -d $DB_DIR ]; then
mkdir $DB_DIR
fi
if [ ! -d $RNA_DIR ]; then
mkdir $RNA_DIR
fi
# 1.データの取得
SRA_FILE=`D3 [$RNAseqData]`
mv "../${DB_DIR}/$SRA_FILE" .
fasterq-dump $SRA_FILE -v --threads [$n_OMPthreads] --split-files -O ./
# 2.品質チェック
fastqc -t [$n_OMPthreads] "${SRA_FILE}_1.fastq" "${SRA_FILE}_2.fastq"
# 3.マッピング
REF_GENOME_FILE=`D3 [$MappingRefGenome]`
time hisat2 -x "../${DB_DIR}/${REF_GENOME_FILE}/genome" -1 "${SRA_FILE}_1.fastq" -2 "${SRA_FILE}_2.fastq" -p [$n_OMPthreads] -S "hisat_output_[${ID}].sam"
# 4. IGVに必要なインデックスファイル作成
samtools view --threads [$n_OMPthreads] -b "hisat_output_[${ID}].sam" -o "hisat_output_[${ID}].bam"
samtools sort --threads [$n_OMPthreads] "hisat_output_[${ID}].bam" -o "hisat_output_[${ID}].sorted.bam"
# 6. リード数のカウント
GENE_ANNO_FILE=`D3 [$GeneAnnotation]`
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T [$n_OMPthreads] -BC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_BC_[${ID}].txt"
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T [$n_OMPthreads] -MOBC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_MOBC_[${ID}].txt"
batch_u_qsub.sh
[download]
#!/bin/sh
#PBS -q 1
#PBS -l select=1:ncpus=1:mpiprocs=1:ompthreads=8
#PBS -l walltime=0:30:00
#PBS -W group_list=gj29
DB_DIR="db"
RNA_DIR="RNAseq"
function DecompressHandler() {
: '
@params ${1} Extension 1
@params ${2} Extension 2
@params ${3} File Name (hoge.${1}.${2})
'
n_compressed_fn=${#3}
if [ ${2} = "zip" ]; then
unzip ${3}
n_extension=4
elif [ ${2} = "tar" ]; then
tar ${3}
n_extension=4
elif [ ${2} = "gz" ]; then
gunzip ${3}
n_extension=3
elif [ ${2} = "bz2" ]; then
bzip2 -d ${3}
n_extension=4
elif [ ${2} = "lha" -o ${2} = "lzh"]; then
lha x ${3}
n_extension=4
elif [ ${1} = "tar" ]; then
tar ${3}
n_extension=$((${#2}+5))
else
n_extension=0
fi
de_compressed_fn=${3:0:$(($n_compressed_fn-$n_extension))}
mv $de_compressed_fn "../${DB_DIR}/"
if [ ${n_extension} -ne 0 ]; then
rm ${3}
fi
echo "${de_compressed_fn}"
}
# Download, Decompress, Dispose
function D3() {
: '
@params ${1} URL
'
fn=`wget -nv --content-disposition $1 2>&1 |cut -d\" -f2`
extensions=( `echo $fn | tr -s '.' ' '`)
n_extensions=${#extensions[@]}
ext1=${extensions-2}
ext2=${extensions-1}
DecompressHandler $ext1 $ext2 $fn
}
#=== START ===
cd $PBS_O_WORKDIR/$RNA_DIR}
if [ ! -d $DB_DIR ]; then
mkdir $DB_DIR
fi
if [ ! -d $RNA_DIR ]; then
mkdir $RNA_DIR
fi
# 1.データの取得
SRA_FILE=`D3 ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra/ERX/ERX288/ERX288491/ERR315326/ERR315326.sra`
mv "../${DB_DIR}/$SRA_FILE" .
fasterq-dump $SRA_FILE -v --threads 8 --split-files -O ./
# 2.品質チェック
fastqc -t 8 "${SRA_FILE}_1.fastq" "${SRA_FILE}_2.fastq"
# 3.マッピング
REF_GENOME_FILE=`D3 https://cloud.biohpc.swmed.edu/index.php/s/grch38/download`
time hisat2 -x "../${DB_DIR}/${REF_GENOME_FILE}/genome" -1 "${SRA_FILE}_1.fastq" -2 "${SRA_FILE}_2.fastq" -p 8 -S "hisat_output_ERR315326.sam"
# 4. IGVに必要なインデックスファイル作成
samtools view --threads 8 -b "hisat_output_ERR315326.sam" -o "hisat_output_ERR315326.bam"
samtools sort --threads 8 "hisat_output_ERR315326.bam" -o "hisat_output_ERR315326.sorted.bam"
# 6. リード数のカウント
GENE_ANNO_FILE=`D3 ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz`
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T 8 -BC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_BC_ERR315326.txt"
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T 8 -MOBC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_MOBC_ERR315326.txt"