Environment Setup for Navin Lab compute cluster. Some installs may be redundant with the cluster…
Login
ssh mulqueen@10.132.80.157
pwd
#/volumes/USR2/Ryan
Make directories for src (usually my own code) and tools (other people's analysis pipelines).
mkdir tools
mkdir src
Download and install conda
cd tools
wget https://repo.anaconda.com/miniconda/Miniconda3-py310_22.11.1-1-Linux-x86_64.sh
bash Miniconda3-py310_22.11.1-1-Linux-x86_64.sh
Conda updates the .bashrc, so next time you log in it is properly in your PATH.
Log out and back in to activate conda.
conda create -n r4.2
conda activate r4.2
conda install -c conda-forge r-base scipy numpy
#add conda environment to .bashrc
echo "conda activate r4.2" >> ~/.bashrc
Log out and log back in to automatically load the conda environment.
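After logging back in, a quick sanity check:
conda env list #r4.2 should be starred as the active environment
which R #should resolve inside ~/miniconda3/envs/r4.2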
Tools to install next:
bcl2fastq2
bwa, bowtie2, hisat2, bismark (methylation)
samtools
bedtools
macs3
#bcl2fastq2
conda install -c bih-cubi bcl2fastq2
#bwa
cd ~/tools
git clone https://github.com/lh3/bwa.git
cd bwa; make
#bowtie2
cd ~/tools
wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.5.1/bowtie2-2.5.1-linux-x86_64.zip
unzip bowtie2-2.5.1-linux-x86_64.zip
#hisat2
conda install -c bioconda hisat2
#bismark
cd ~/tools
wget https://github.com/FelixKrueger/Bismark/archive/refs/tags/0.24.0.tar.gz
tar -xzf 0.24.0.tar.gz #extracts to Bismark-0.24.0/
#samtools
conda install -c bioconda samtools
#bedtools
conda install -c bioconda bedtools
#seqkit
conda install -c bioconda seqkit
#macs3
pip install macs3
#Add to PATH (for tools not conda installed); single quotes so $PATH expands at login, not now
echo 'export PATH=$PATH:~/tools/bowtie2-2.5.1-linux-x86_64' >> ~/.bashrc
echo 'export PATH=$PATH:~/tools/bwa' >> ~/.bashrc
echo 'export PATH=$PATH:~/tools/Bismark-0.24.0/' >> ~/.bashrc
#multiqc
pip install multiqc
Now install all the R packages.
Install package dependencies in the environment
conda install -c conda-forge r-xml
conda install -c conda-forge r-gert
conda install -c conda-forge r-ragg
conda install -c conda-forge r-spdep
conda install -c conda-forge r-terra
conda install -c conda-forge zlib
I'm sure there are a ton that I'm missing, but these get things started. Then, within an R session:
install.packages(c("BiocManager","devtools"))
install.packages(c("Seurat","Signac","harmony","vcfR")) #biocmanager install is necessary for signac
BiocManager::install(c("cicero","ComplexHeatmap"))
devtools::install_github('cole-trapnell-lab/monocle3')
devtools::install_github("navinlabcode/copykit")
Pairix download and installation (https://github.com/4dn-dcic/pairix#installation-for-pairix)
cd ~/tools
git clone https://github.com/4dn-dcic/pairix
cd pairix
make
#and then add to PATH
echo 'export PATH=$PATH:~/tools/pairix/bin' >> ~/.bashrc
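A quick hedged usage check (file name hypothetical; pairix works on bgzipped, position-sorted .pairs files):
pairix -f mylib.pairs.gz #builds the mylib.pairs.gz.px2 index
pairix mylib.pairs.gz '1:1-1000000|2:1-1000000' #query a 2D block of contacts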
Installing cooler in an alternative conda env
conda deactivate #get out of the r4.2 env
conda create -n "cooler_env" python=3.9.15
conda activate cooler_env
pip install cooler
pip install Cython
pip install cooltools #need to downgrade python for this
pip install pypairix
pip install plotly seaborn
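A hedged sanity check tying pairix and cooler together (file names hypothetical; a hg38.chrom.sizes file can be fetched from UCSC):
cooler cload pairix hg38.chrom.sizes:10000 mylib.pairs.gz mylib.10kb.cool #aggregate a pairix-indexed .pairs file into 10kb bins
cooler info mylib.10kb.cool #prints bin size, pixel count, and other metadata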
Installing schicluster https://github.com/zhoujt1994/scHiCluster
conda create -n schicluster python==3.6.8
conda activate schicluster
pip install git+https://github.com/zhoujt1994/scHiCluster.git
Docs: https://zhoujt1994.github.io/scHiCluster/intro.html
Installing EagleC https://github.com/XiaoTaoWang/EagleC
Additionally installing NeoLoopFinder in the same EagleC environment. https://github.com/XiaoTaoWang/NeoLoopFinder
conda install mamba -n base -c conda-forge
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
mamba create -n EagleC scikit-learn statsmodels matplotlib cooler pyBigWig pyensembl python=3.8 joblib=1.0.1 tensorflow=2 cython=0.29.24
conda activate EagleC
mamba install cooler matplotlib pyensembl pybigwig intervaltree scikit-learn=1.1.2 joblib=1.1.0 rpy2 r-mgcv
mamba install -c bioconda bedtools
pip install eaglec
pip install numpy==1.21
download-pretrained-models #script installed with the eaglec package; fetches the pre-trained CNN models
#mamba install -c anaconda pomegranate=0.14.4
pip install pomegranate==0.14.4
pip install -U neoloop TADLib
pip install cooltools
conda install sniffles=2.2 #for ONT data
#quick start testing of eaglec
cd ~/ref
wget -O SKNAS-MboI-allReps-filtered.mcool -L https://www.dropbox.com/s/f80bgn11d7wfgq8/SKNAS-MboI-allReps-filtered.mcool?dl=0
predictSV --hic-5k SKNAS-MboI-allReps-filtered.mcool::/resolutions/5000 \
--hic-10k SKNAS-MboI-allReps-filtered.mcool::/resolutions/10000 \
--hic-50k SKNAS-MboI-allReps-filtered.mcool::/resolutions/50000 \
-O SK-N-AS -g hg38 --balance-type CNV --output-format full \
--prob-cutoff-5k 0.8 --prob-cutoff-10k 0.8 --prob-cutoff-50k 0.99999
#testing install of neoloopfinder
cd ~/ref
wget -O SKNMC-MboI-allReps-filtered.mcool -L https://www.dropbox.com/s/tuhhrecipkp1u8k/SKNMC-MboI-allReps-filtered.mcool?dl=0
calculate-cnv -H SKNMC-MboI-allReps-filtered.mcool::resolutions/25000 -g hg38 \
-e MboI --output SKNMC_25k.CNV-profile.bedGraph
segment-cnv --cnv-file ~/ref/SKNMC_25k.CNV-profile.bedGraph --binsize 25000 \
--ploidy 2 --output ~/ref/SKNMC_25k.CNV-seg.bedGraph --nproc 4
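The usual NeoLoopFinder flow continues by correcting the matrix with the segmented CNV profile; a hedged sketch (flags from my reading of the NeoLoopFinder README, verify with correct-cnv --help):
correct-cnv -H SKNMC-MboI-allReps-filtered.mcool::resolutions/25000 \
--cnv-file ~/ref/SKNMC_25k.CNV-seg.bedGraph --nproc 4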
Installing bedGraphToBigWig in the EagleC environment (for plotting)
cd ~/tools
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedGraphToBigWig
chmod a+x bedGraphToBigWig
#add to PATH (binary sits directly in ~/tools)
echo 'export PATH=$PATH:~/tools' >> ~/.bashrc
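Usage sketch for plotting prep (arguments: sorted bedGraph, chrom.sizes, output bigWig; ~/ref/hg38.chrom.sizes is an assumed path, fetchable from UCSC):
sort -k1,1 -k2,2n ~/ref/SKNMC_25k.CNV-profile.bedGraph > ~/ref/SKNMC_25k.CNV-profile.sorted.bedGraph
bedGraphToBigWig ~/ref/SKNMC_25k.CNV-profile.sorted.bedGraph ~/ref/hg38.chrom.sizes ~/ref/SKNMC_25k.CNV-profile.bw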
Installing HiSV in cooler_env https://github.com/GaoLabXDU/HiSV
conda activate cooler_env
conda install gcc
pip install numpy pysam pandas prox_tv
#prox_tv was annoying but installed from the git source when pip failed
git clone https://github.com/gaolabXDU/HiSV
Installing HiC_breakfinder in cooler_env https://github.com/dixonlab/hic_breakfinder
cd ~/tools
git clone https://gitlab.com/libeigen/eigen.git #https://eigen.tuxfamily.org/dox/GettingStarted.html
git clone https://github.com/pezmaster31/bamtools.git #https://github.com/pezmaster31/bamtools/wiki/Building-and-installing
conda install -c conda-forge jsoncpp
git clone https://github.com/dixonlab/hic_breakfinder.git
cd hic_breakfinder
#first attempt: point configure at the locally cloned bamtools/eigen
./configure --prefix=/volumes/USR2/Ryan/tools/hic_breakfinder/ CPPFLAGS="-I /volumes/USR2/Ryan/tools/bamtools/include -I /volumes/USR2/Ryan/tools/eigen" LDFLAGS="-L/volumes/USR2/Ryan/tools/bamtools/lib/"
#that configure did not find the libraries for me, so retry against the conda-installed copies
conda install -c bioconda bamtools
conda install -c conda-forge eigen
./configure CPPFLAGS="-I /volumes/USR2/Ryan/miniconda3/envs/cooler_env/include/bamtools -I /volumes/USR2/Ryan/miniconda3/envs/cooler_env/include/eigen3/" LDFLAGS="-L/volumes/USR2/Ryan/miniconda3/envs/cooler_env/lib/bamtools"
make
Installation of samblaster https://github.com/GregoryFaust/samblaster
cd ~/tools
wget https://github.com/GregoryFaust/samblaster/archive/refs/heads/master.zip
unzip master.zip
cd samblaster-master
make
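Typical in-stream usage sketch (read files hypothetical; samblaster marks duplicates on name-grouped SAM straight off the aligner):
bwa mem -t 8 ~/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa R1.fastq.gz R2.fastq.gz \
| samblaster \
| samtools sort -o sample.bam -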
https://github.com/pk7zuva/Circle_finder
cd ~/tools
mkdir circle-finder
cd circle-finder
wget https://raw.githubusercontent.com/pk7zuva/Circle_finder/3eb333db2ea6277dde36cbf640be9afeb710c717/circle_finder-pipeline-bwa-mem-samblaster.sh #use the raw URL; the blob URL downloads the HTML page instead of the script
Need a separate environment for scbs analysis because it requires a lower Python version.
https://github.com/LKremer/scbs https://www.bioconductor.org/packages/release/bioc/vignettes/Melissa/inst/doc/process_files.html
Naming the scbs conda environment scbs_env.
conda deactivate #get out of the r4.2 env
conda create -n "scbs_env" python=3.9.15
conda activate scbs_env
python3 -m pip install scbs
conda install -c conda-forge r-xml r-gert r-ragg r-spdep r-terra r-stringi
scbs --version
#scbs, version 0.5.4
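Subcommand names shift between scbs versions, so sanity-check the CLI rather than trusting this doc:
scbs --help #lists the subcommands (prepare, filter, smooth, scan, matrix in the 0.5 series)
scbs prepare --help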
Install R packages for downstream analysis (within an R session).
install.packages(c("BiocManager","devtools"))
BiocManager::install("Melissa")
Downloading mm10 and hg38 reference genomes. Using the cellranger ones for consistency with commercial products.
mkdir ~/ref
cd ~/ref
wget https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-GRCh38-2020-A-2.0.0.tar.gz
wget https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz
tar -xvf refdata-cellranger-arc-GRCh38-2020-A-2.0.0.tar.gz
tar -xvf refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz
Skipping the mouse genome for now, but it's the same process.
bwa index ~/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa &
bismark_genome_preparation ~/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta &
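Both indexing jobs run in the background; check on them and block until they finish before using the indices:
jobs
wait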
Installation of scDNA Replication tools https://github.com/shahcompbio/scdna_replication_tools/tree/main
git clone git@github.com:shahcompbio/scdna_replication_tools.git
cd scdna_replication_tools #git clone creates this directory (no -main suffix)
conda create -n scdna_replication_tools python==3.7.4
conda activate scdna_replication_tools
python -m venv venv/
source venv/bin/activate
pip install numpy cython
pip install -r requirements3.txt
python setup.py develop
If the build trips over Cython/numpy dependencies, setup.py can declare them explicitly (pattern from https://luminousmen.com/post/resolve-cython-and-numpy-dependencies):
import numpy
from Cython.Build import cythonize
from setuptools import setup, Extension

setup(
    ...
    setup_requires=[
        'setuptools>=18.0',
        'cython',
    ],
    ext_modules=[
        Extension('package.cython_code1', sources=['package/cython_code1.pyx']),
        Extension('package.cython_code2', sources=['package/cython_code2.pyx']),
    ],
    include_dirs=[numpy.get_include()],
)
Necessary tools for ONT validation of structural variant calls
#install singularity (conda)
conda install -c conda-forge singularity
#initialize epi2me
./nextflow run epi2me-labs/wf-human-variation --help
#test
OUTPUT=output
nextflow run epi2me-labs/wf-human-variation \
-w ${OUTPUT}/workspace \
-profile standard \
--snp --sv \
--bam demo_data/demo.bam \
--bed demo_data/demo.bed \
--ref demo_data/demo.fasta \
--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \
--sample_name MY_SAMPLE \
--out_dir ${OUTPUT} \
-with-singularity \
-without-docker
#success!
#install dorado (prebuilt binary)
"""
cd ~/tools #install next to the other tools so the PATH line below matches
wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz
tar -xvf dorado-0.3.4-linux-x64.tar.gz
#add to path in ~/.bashrc
#PATH=$PATH:/volumes/USR2/Ryan/tools/dorado-0.3.4-linux-x64/bin
#download dorado models
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz
"""
#install nextflow
"""
cd ~/
curl -s "https://get.sdkman.io" | bash
source "$HOME/.sdkman/bin/sdkman-init.sh"
sdk install java 17.0.6-amzn
curl -s https://get.nextflow.io | bash
./nextflow self-update
mv ~/nextflow ~/tools #move into a directory on PATH
"""
#install docker from https://www.docker.com/products/docker-desktop/ (Apple silicon build, on the local machine)
"""
#run docker and it should be in the path
docker version
"""
#connect to both the USR2 and seq server volumes
ref="/Volumes/USR2/Ryan/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa"
wd_out="/Volumes/seq/projects/gccACT/230808_mdamb231_ONT"
output_name="20230726_1239_2D_PAO38369_output" #change to each flowcell
pod5_dir="/Volumes/seq/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell
#connect through Finder to seq and USR2 (for reference genome.fa)
nextflow run epi2me-labs/wf-basecalling \
--input $pod5_dir \
--dorado_ext "pod5" \
--output_bam \
--cuda-device cuda:all \
--ref $ref \
--verbose \
--out_dir ${wd_out}/${output_name} \
--basecaller_cfg "dna_r10.4.1_e8.2_400bps_hac@v4.2.0" \
--remora_cfg "dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2"
Trying Higashi for scHi-C
cd ~/tools
git clone https://github.com/ma-compbio/Higashi/
cd Higashi
python setup.py install
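Higashi is driven by a JSON config file; a hedged sketch of the wrapper API as shown in the Higashi README (config path hypothetical):
python - <<'EOF'
from higashi.Higashi_wrapper import Higashi
h = Higashi("config.JSON") #path to the config template from the repo
h.process_data() #generate intermediate files from the contact lists
h.prep_model()
h.train_for_embeddings() #learn the single-cell embeddings
EOF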
ssh seadragon
bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=1:gmem=4 -M 16 -R rusage[mem=16] /bin/bash #get interactive gpu node
bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R rusage[mem=16] /bin/bash #interactive transfer node; this queue has internet access for environment setup
module load miniconda3/39_23.5.0; eval "$(/risapps/rhel7/miniconda3/py39_4.12.0/bin/conda shell.bash hook)"
module load cuda11.5/toolkit/11.5.1
conda create --name hic python=3.11 #create the env under the module-loaded miniconda
#install higashi from 4dn
conda activate hic
conda install -c conda-forge mamba
mamba install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
mamba install -c bioconda pybedtools cooler
conda install -c bioconda pybedtools
mamba install -c conda-forge zlib
mamba install -c ruochiz fasthigashi
# CUDA 10.2 (version of cuda in modules)
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch
mkdir ~/tools
cd ~/tools
git clone https://github.com/ma-compbio/Higashi/
cd Higashi
conda install python==3.9 numpy==1.16.5
python setup.py install
#recap: minimal FastHigashi environment
conda create --name hic
conda activate hic
mamba install -c ruochiz fasthigashi
Download data via transfer node
bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R rusage[mem=16] /bin/bash #interactive transfer node; this queue has internet access for downloads
#ONT data
rsync \
-LPr mulqueen@10.132.80.157:/volumes/seq/projects/gccACT/230808_mdamb231_ONT ~/projects/gccACT
#dorado prebuilt
rsync \
-LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/tools/dorado-0.3.4-linux-x64.tar.gz ~/tools
#dorado reference genome
#download model for base calling
#wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz #cpg??
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0
rsync \
-LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/dna_r10.4.1_e8.2_400bps_hac@v4.2.0 ~/
rsync \
-LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 ~/
#nextflow epi2me download
rsync \
-LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/wf-human-variation-master ~/
#my 10.132.80.157 references
rsync \
-LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/ref ~/
Use CUDA node
bsub -Is -W 6:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 16 -R rusage[mem=16] /bin/bash #get interactive gpu node
module load singularity/3.7.0
module load nextflow/23.04.3
module load cuda11.5/toolkit/11.5.1
module load samtools/1.15
#module load dorado
#module load wf-human-variation
OUTPUT=output
nextflow run ./wf-human-variation-master/main.nf \
-w ${OUTPUT}/workspace \
--snp --sv --bam demo_data/demo.bam \
--bed demo_data/demo.bed \
--ref demo_data/demo.fasta \
--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \
--sample_name MY_SAMPLE \
--out_dir ${OUTPUT} \
-with-singularity -without-docker
# If you want to start with a FASTQ, you can generate an unaligned BAM from FASTQ with samtools import http://www.htslib.org/doc/samtools-import.html and allow the workflow to take care of mapping.
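A hedged sketch of that conversion (single-end reads; file names hypothetical):
samtools import -0 reads.fastq.gz -o unaligned.bam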
#CPU Node Run
bsub -Is -W 6:00 -q medium -n 1 -M 16 -R rusage[mem=16] /bin/bash #get interactive cpu node
#module load singularity/3.7.0
#module load nextflow/23.04.3
#module load cuda11.5/toolkit/11.5.1
module load samtools/1.15
#module load dorado
ref="/rsrch4/home/genetics/rmulqueen/ref/genome.fa"
wd_out="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT"
output_name="20230726_1239_2D_PAO38369_output" #change to each flowcell
pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell
dorado basecaller \
--verbose \
--device cpu \
--reference ${ref} \
--emit-sam \
--max-reads 100 \
--batchsize 64 \
'dna_r10.4.1_e8.2_400bps_hac@v4.2.0' \
${pod5_dir}/pod5_pass/ > ${wd_out}/${output_name}.sam #dorado emits SAM to stdout; capture it to a file
#untested from here on, but use the bam from the wf-basecalling output as input
nextflow run ~/wf-human-variation-master/main.nf \
-w ${wd_out}/${output_name}/workspace \
-profile singularity \
--snp --sv --cnv --methyl \
--ref ${ref} \
--bam ${wd_out}/${output_name}.bam \
--dorado_ext pod5 \
--basecaller_basemod_threads 40 \
--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.2.0' \
--remora_cfg 'dna_r10.4.1_e8.2_400bps_sup@v4.2.0_5mCG_5hmCG@v2' \
--sample_name ${output_name} \
--out_dir ${wd_out}/${output_name}/ \
-with-singularity \
-without-docker
ssh r1prpsciapp13
module purge
module load nextflow/23.04.3
module load singularity/3.7.0
#manually pull singularity image?
nextflow pull epi2me-labs/wf-human-variation
singularity pull --name ontresearch-wf-human-variation-shac4db03c19b6ff1277a24ec28a19e564d628d478f.img.pulling.1669977561040 docker://ontresearch/wf-human-variation:shac4db03c19b6ff1277a24ec28a19e564d628d478f > /dev/null
sif_in="ontresearch-wf-human-variation-shac4db03c19b6ff1277a24ec28a19e564d628d478f.img.pulling.1669977561040"
OUTPUT=output
nextflow run ~/wf-human-variation-master/main.nf \
-w ${OUTPUT}/workspace \
-profile standard \
--snp --sv --bam demo_data/demo.bam \
--bed demo_data/demo.bed \
--ref demo_data/demo.fasta \
--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \
--sample_name MY_SAMPLE --out_dir ${OUTPUT} \
-without-docker -with-singularity $sif_in
tar -xzf /risapps/tps_source/tps_source/wf-human-variation/demo_data.tar.gz
OUTPUT=output
nextflow run wf-human-variation-master \
-w ${OUTPUT}/workspace \
-profile standard \
--snp --sv \
--bam demo_data/demo.bam \
--bed demo_data/demo.bed \
--ref demo_data/demo.fasta \
--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \
--sample_name MY_SAMPLE \
--out_dir ${OUTPUT} \
-with-singularity -without-docker
bsub -q "gpu-medium" dorado_test.bsub