3A
  • Portfolio Top
  • Categories
  • Tags
  • Archives

RNA-seqデータ解析(バッチスクリプト版)

今回は、RNA-seqデータ解析(対話モード版)で行なった処理を毎回対話モードで行うのは面倒なので、バッチスクリプトを作成して一括で行います。

以下のパラメタファイル(params.json)を読み込んで各種操作を行います。

{
  "qsub_q": "u-debug",
  "n_node": 1,
  "n_cpus": 1,
  "n_MPIprocs": 1,
  "n_OMPthreads": 8,
  "walltime": "0:30:00",
  "group_list": "gj29",

  "ID": "ERR315326",

  "RNAseqData": "ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra/ERX/ERX288/ERX288491/ERR315326/ERR315326.sra",
  "MappingRefGenome": "https://cloud.biohpc.swmed.edu/index.php/s/grch38/download",
  "GeneAnnotation": "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz"
}

各変数は、 jqコマンドを使って以下のように読み込みます。

[変数名]=$(cat [パラメタファイルへのパス] | jq -r .[key名])
[変数名]=`jq -r .[key名] [パラメタファイルへのパス]`

成果物

make_batch_u_qsub.sh という以下のシェルスクリプトを叩くことで、params.json の内容を踏まえたバッチスクリプト batch_u_qsub.sh を作成します。

make_batch_u_qsub.sh [download]

#!/bin/bash

Ssymbol="["
Esymbol="]"

function AssignParams () {
  << COMMENTOUT
  @params params.json     {"qsub_q": "u-debug"}
  @params templates.sh    PBS -q [${n_node}]
  @return batch_u_qsub.sh PBS -q u-debug
  COMMENTOUT

  line="${1}"
  while true
  do
    Spos=`echo "${line}" | awk -v pattern=${Ssymbol} '{print index($0, pattern)}'`
    Epos=`echo "${line}" | awk -v pattern=${Esymbol} '{print index($0, pattern)}'`
    if [ ${Spos} -ne 0 -a ${Epos} -ne 0 -a ${Spos} -lt ${Epos} ]; then
      PREFIX=`echo "${line:0:$(($Spos-1))}"`
      VARIABLE=`echo "${line}" | cut -c "$(($Spos+1))-$(($Epos-1))"`
      CONTENT=`eval echo ${VARIABLE}`
      SUFFIX=`echo "${line:$Epos}"`
      line="${PREFIX}${CONTENT}${SUFFIX}"
    else
      break
    fi
  done
  echo "${line}" >> $OUTPUT_FILE
}

echo -n "Path to params file (.json): "
read PARAMS_FILE
# Convert each element of json to a variable.
KEYS=`jq -r 'keys[]' $PARAMS_FILE`
for key in $KEYS; do
  eval $key=`jq -r .${key} $PARAMS_FILE`
done

OUTPUT_FILE="${ID}/batch_u_qsub.sh"
if [ ! -d ${ID} ]; then
  mkdir ${ID}
fi

echo -n "Path to templates file (.sh): "
read TEMPLATES_FILE

PRE_IFS=$IFS
IFS=$'\n'
for line in `cat ${TEMPLATES_FILE}`
do
  AssignParams ${line}
done
IFS=$PRE_IFS

templates.sh [download]

#!/bin/sh
#PBS -q [${n_node}]
#PBS -l select=[${n_node}]:ncpus=[${n_cpus}]:mpiprocs=[${n_MPIprocs}]:ompthreads=[${n_OMPthreads}]
#PBS -l walltime=[${walltime}]
#PBS -W group_list=[${group_list}]

DB_DIR="db"
RNA_DIR="RNAseq"

function DecompressHandler() {
  : '
  @params ${1} Extension 1
  @params ${2} Extension 2
  @params ${3} File Name (hoge.${1}.${2})
  '

  n_compressed_fn=${#3}

  if [ ${2} = "zip" ]; then
    unzip ${3}
    n_extension=4
  elif [ ${2} = "tar" ]; then
    tar ${3}
    n_extension=4
  elif [ ${2} = "gz" ]; then
    gunzip ${3}
    n_extension=3
  elif [ ${2} = "bz2" ]; then
    bzip2 -d ${3}
    n_extension=4
  elif [ ${2} = "lha" -o ${2} = "lzh"]; then
    lha x ${3}
    n_extension=4
  elif [ ${1} = "tar" ]; then
    tar ${3}
    n_extension=$((${#2}+5))
  else
    n_extension=0
  fi

  de_compressed_fn=${3:0:$(($n_compressed_fn-$n_extension))}
  mv $de_compressed_fn "../${DB_DIR}/"
  if [ ${n_extension} -ne 0 ]; then
    rm ${3}
  fi

  echo "${de_compressed_fn}"
}

# Download, Decompress, Dispose
function D3() {
  : '
  @params ${1} URL
  '

  fn=`wget -nv --content-disposition $1 2>&1 |cut -d\" -f2`

  extensions=( `echo $fn | tr -s '.' ' '`)
  n_extensions=${#extensions[@]}

  ext1=${extensions[$(($n_extensions-2))]}
  ext2=${extensions[$(($n_extensions-1))]}
  DecompressHandler $ext1 $ext2 $fn
}

#=== START ===
cd $PBS_O_WORKDIR/$RNA_DIR}
if [ ! -d $DB_DIR ]; then
  mkdir $DB_DIR
fi
if [ ! -d $RNA_DIR ]; then
  mkdir $RNA_DIR
fi

# 1.データの取得
SRA_FILE=`D3 [$RNAseqData]`
mv "../${DB_DIR}/$SRA_FILE" .
fasterq-dump $SRA_FILE -v --threads [$n_OMPthreads] --split-files -O ./
# 2.品質チェック
fastqc -t [$n_OMPthreads] "${SRA_FILE}_1.fastq" "${SRA_FILE}_2.fastq"
# 3.マッピング
REF_GENOME_FILE=`D3 [$MappingRefGenome]`
time hisat2 -x "../${DB_DIR}/${REF_GENOME_FILE}/genome" -1 "${SRA_FILE}_1.fastq" -2 "${SRA_FILE}_2.fastq" -p [$n_OMPthreads] -S "hisat_output_[${ID}].sam"
# 4. IGVに必要なインデックスファイル作成
samtools view --threads [$n_OMPthreads] -b "hisat_output_[${ID}].sam" -o "hisat_output_[${ID}].bam"
samtools sort --threads [$n_OMPthreads] "hisat_output_[${ID}].bam" -o "hisat_output_[${ID}].sorted.bam"
# 6. リード数のカウント
GENE_ANNO_FILE=`D3 [$GeneAnnotation]`
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T [$n_OMPthreads] -BC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_BC_[${ID}].txt"
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T [$n_OMPthreads] -MOBC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_MOBC_[${ID}].txt"

batch_u_qsub.sh [download]

#!/bin/sh
#PBS -q 1
#PBS -l select=1:ncpus=1:mpiprocs=1:ompthreads=8
#PBS -l walltime=0:30:00
#PBS -W group_list=gj29
DB_DIR="db"
RNA_DIR="RNAseq"
function DecompressHandler() {
  : '
  @params ${1} Extension 1
  @params ${2} Extension 2
  @params ${3} File Name (hoge.${1}.${2})
  '
  n_compressed_fn=${#3}
  if [ ${2} = "zip" ]; then
    unzip ${3}
    n_extension=4
  elif [ ${2} = "tar" ]; then
    tar ${3}
    n_extension=4
  elif [ ${2} = "gz" ]; then
    gunzip ${3}
    n_extension=3
  elif [ ${2} = "bz2" ]; then
    bzip2 -d ${3}
    n_extension=4
  elif [ ${2} = "lha" -o ${2} = "lzh"]; then
    lha x ${3}
    n_extension=4
  elif [ ${1} = "tar" ]; then
    tar ${3}
    n_extension=$((${#2}+5))
  else
    n_extension=0
  fi
  de_compressed_fn=${3:0:$(($n_compressed_fn-$n_extension))}
  mv $de_compressed_fn "../${DB_DIR}/"
  if [ ${n_extension} -ne 0 ]; then
    rm ${3}
  fi
  echo "${de_compressed_fn}"
}
# Download, Decompress, Dispose
function D3() {
  : '
  @params ${1} URL
  '
  fn=`wget -nv --content-disposition $1 2>&1 |cut -d\" -f2`
  extensions=( `echo $fn | tr -s '.' ' '`)
  n_extensions=${#extensions[@]}
  ext1=${extensions-2}
  ext2=${extensions-1}
  DecompressHandler $ext1 $ext2 $fn
}
#=== START ===
cd $PBS_O_WORKDIR/$RNA_DIR}
if [ ! -d $DB_DIR ]; then
  mkdir $DB_DIR
fi
if [ ! -d $RNA_DIR ]; then
  mkdir $RNA_DIR
fi
# 1.データの取得
SRA_FILE=`D3 ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra/ERX/ERX288/ERX288491/ERR315326/ERR315326.sra`
mv "../${DB_DIR}/$SRA_FILE" .
fasterq-dump $SRA_FILE -v --threads 8 --split-files -O ./
# 2.品質チェック
fastqc -t 8 "${SRA_FILE}_1.fastq" "${SRA_FILE}_2.fastq"
# 3.マッピング
REF_GENOME_FILE=`D3 https://cloud.biohpc.swmed.edu/index.php/s/grch38/download`
time hisat2 -x "../${DB_DIR}/${REF_GENOME_FILE}/genome" -1 "${SRA_FILE}_1.fastq" -2 "${SRA_FILE}_2.fastq" -p 8 -S "hisat_output_ERR315326.sam"
# 4. IGVに必要なインデックスファイル作成
samtools view --threads 8 -b "hisat_output_ERR315326.sam" -o "hisat_output_ERR315326.bam"
samtools sort --threads 8 "hisat_output_ERR315326.bam" -o "hisat_output_ERR315326.sorted.bam"
# 6. リード数のカウント
GENE_ANNO_FILE=`D3 ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz`
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T 8 -BC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_BC_ERR315326.txt"
featureCounts hisat_output_ERR315326.bam -p -t exon -g gene_id -s 0 -T 8 -MOBC -a "../${DB_DIR}/${GENE_ANNO_FILE}" -o "Counts_MOBC_ERR315326.txt"

  • « RNA-seqデータ解析(対話モード版)
  • 分子生命科学Ⅲ 第4回 »
hidden
Table of Contents
Published
Oct 17, 2019
Last Updated
Oct 17, 2019
Category
情報基礎実験(浅井)
Tags
  • 3A 127
  • 情報基礎実験(浅井) 13
Contact
Other contents
  • Home
  • Blog
  • Front-End
  • Kerasy
  • Python-Charmers
  • Translation-Gummy
    • 3A - Shuto's Notes
    • MIT
    • Powered by Pelican. Theme: Elegant