Environment Setup

Environment Setup for Navin Lab compute cluster. Some installs may be redundant with the cluster…


Make directories for src (usually my own code) and tools (other people's analysis pipelines).

# Create the two working directories under $HOME:
#   ~/tools - other people's analysis pipelines
#   ~/src   - my own code
# -p makes the step safe to re-run, and anchoring at ~ matches the later `cd ~/tools` calls.
mkdir -p ~/tools ~/src

Download and install conda

# Download the Miniconda3 (py310, 22.11.1-1, Linux x86_64) installer into tools/ and run it.
cd tools
wget https://repo.anaconda.com/miniconda/Miniconda3-py310_22.11.1-1-Linux-x86_64.sh
# Run the installer script that was just downloaded.
bash Miniconda3-py310_22.11.1-1-Linux-x86_64.sh

Conda updates the .bashrc, so next time you log in it is properly in your PATH.

Logout and login to activate conda

# Create a conda environment named r4.2 and switch into it.
conda create -n r4.2 
conda activate r4.2
# Install R (r-base) plus scipy and numpy from the conda-forge channel.
conda install -c conda-forge r-base scipy numpy
# Append the activate command to ~/.bashrc so the environment loads at login.
# Each run of this line appends another copy.
echo "conda activate r4.2" >> ~/.bashrc

Log out and log back in so the conda environment is loaded automatically.

General tools

FastQ Generation:



bwa bowtie2 hisat2 bismark (methylation)

Fastq/sam/bam manipulation


Bed file manipulation


ATAC peak calling


# General tools. Conda/pip installs land in the active environment;
# source and binary downloads go under ~/tools and are added to PATH at the end.

# FastQ generation
conda install -c bih-cubi bcl2fastq2

# bwa: clone and build from source in ~/tools/bwa
cd ~/tools
git clone https://github.com/lh3/bwa.git
cd bwa; make

# bowtie2: prebuilt 2.5.1 binaries. Unzip so the directory named in PATH below exists.
cd ~/tools
wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.5.1/bowtie2-2.5.1-linux-x86_64.zip
unzip bowtie2-2.5.1-linux-x86_64.zip

conda install -c bioconda hisat2

# Bismark (methylation): the tag archive would otherwise be saved as 0.24.0.tar.gz,
# so name it explicitly, then extract so ~/tools/Bismark-0.24.0 exists for PATH below.
cd ~/tools
wget -O Bismark-0.24.0.tar.gz https://github.com/FelixKrueger/Bismark/archive/refs/tags/0.24.0.tar.gz
tar -xzf Bismark-0.24.0.tar.gz

# Fastq/sam/bam manipulation
conda install -c bioconda samtools

# Bed file manipulation
conda install -c bioconda bedtools

conda install -c bioconda seqkit

# ATAC peak calling
pip install macs3

#Add to PATH (for those not conda installed)
# Single quotes keep $PATH and $HOME literal in ~/.bashrc so they are expanded at
# login, rather than freezing whatever PATH happens to be set while running this line.
echo 'export PATH="$PATH:$HOME/tools/bowtie2-2.5.1-linux-x86_64"' >> ~/.bashrc
echo 'export PATH="$PATH:$HOME/tools/bwa"' >> ~/.bashrc
echo 'export PATH="$PATH:$HOME/tools/Bismark-0.24.0/"' >> ~/.bashrc

pip install multiqc

General RNA and ATAC processing can be done with Seurat and Signac in R

Now install all the R packages.

Install package dependencies in the environment

# Install R packages (r-xml, r-gert, r-ragg, r-spdep, r-terra) and zlib from conda-forge
# into the active environment, one package per call.
conda install -c conda-forge r-xml
conda install -c conda-forge r-gert
conda install -c conda-forge r-ragg
conda install -c conda-forge r-spdep
conda install -c conda-forge r-terra
conda install -c conda-forge zlib

I’m sure there are a ton that I’m missing. But I’m getting started with these.

install.packages(c("Seurat","Signac","harmony","vcfR")) #biocmanager install is necessary for signac

HiC Data Analysis can be done with the DipC group’s released hickit

https://github.com/4dn-dcic/pairix#installation-for-pairix Pairix download and installation

# Clone and build pairix under ~/tools (see the installation link above).
cd ~/tools
git clone https://github.com/4dn-dcic/pairix
cd pairix
# Build the binaries; the clone alone does not produce them.
make
#and then add to path
# Single quotes so $PATH/$HOME are expanded at login, not when this line is run.
echo 'export PATH="$PATH:$HOME/tools/pairix/bin:$HOME/tools/pairix/util"' >> ~/.bashrc

Installing cooler on alternative conda env

# Build a separate Python 3.9.15 environment (cooler_env) for cooler and related Hi-C tools.
conda deactivate #leave the r4.2 env that ~/.bashrc activates
conda create -n "cooler_env" python=3.9.15
conda activate cooler_env
pip install cooler
pip install Cython
pip install cooltools #need to downgrade python for this
pip install pypairix
pip install plotly seaborn

Installing schicluster https://github.com/zhoujt1994/scHiCluster

# Dedicated Python 3.6.8 environment for scHiCluster, installed with pip straight from GitHub.
conda create -n schicluster python==3.6.8
conda activate schicluster
pip install git+https://github.com/zhoujt1994/scHiCluster.git

Testing of SV detectors

Installing EagleC https://github.com/XiaoTaoWang/EagleC

Additionally installing NeoLoopFinder in the same EagleC environment. https://github.com/XiaoTaoWang/NeoLoopFinder

# Install mamba into the base environment and register the defaults, bioconda and conda-forge channels.
conda install mamba -n base -c conda-forge
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
# Create the EagleC environment (python 3.8) with pinned joblib, tensorflow and cython.
mamba create -n EagleC scikit-learn statsmodels matplotlib cooler pyBigWig pyensembl python=3.8 joblib=1.0.1 tensorflow=2 cython=0.29.24

conda activate EagleC
# Second install pass inside the environment; it pins joblib=1.1.0, unlike the
# joblib=1.0.1 pin in the create line above, and adds rpy2 and r-mgcv.
mamba install cooler matplotlib pyensembl pybigwig intervaltree scikit-learn=1.1.2 joblib=1.1.0 rpy2 r-mgcv

mamba install -c bioconda bedtools

pip install eaglec
# numpy is pinned after eaglec is installed.
pip install numpy==1.21

# pomegranate 0.14.4 via pip; the mamba alternative is left commented out.
#mamba install -c anaconda pomegranate=0.14.4
pip install pomegranate==0.14.4
# NeoLoopFinder (neoloop) and TADLib, installed into the same environment.
pip install -U neoloop TADLib
pip install cooltools
conda install sniffles=2.2 #for ONT data

#quick start testing of eaglec
# ~/ref is only created further down in these notes, so make sure it exists first.
mkdir -p ~/ref
cd ~/ref
# Quote the URL: an unquoted '?' is a shell glob character.
wget -O SKNAS-MboI-allReps-filtered.mcool -L "https://www.dropbox.com/s/f80bgn11d7wfgq8/SKNAS-MboI-allReps-filtered.mcool?dl=0"
# Predict SVs from the 5 kb, 10 kb and 50 kb resolutions of the downloaded .mcool.
predictSV --hic-5k SKNAS-MboI-allReps-filtered.mcool::/resolutions/5000 \
            --hic-10k SKNAS-MboI-allReps-filtered.mcool::/resolutions/10000 \
            --hic-50k SKNAS-MboI-allReps-filtered.mcool::/resolutions/50000 \
            -O SK-N-AS -g hg38 --balance-type CNV --output-format full \
            --prob-cutoff-5k 0.8 --prob-cutoff-10k 0.8 --prob-cutoff-50k 0.99999

#testing install of neoloopfinder
cd ~/ref
wget -O SKNMC-MboI-allReps-filtered.mcool -L "https://www.dropbox.com/s/tuhhrecipkp1u8k/SKNMC-MboI-allReps-filtered.mcool?dl=0"
# Compute a CNV profile at 25 kb resolution from the downloaded .mcool.
calculate-cnv -H SKNMC-MboI-allReps-filtered.mcool::resolutions/25000 -g hg38 \
                -e MboI --output SKNMC_25k.CNV-profile.bedGraph

# Segment the CNV profile written by the previous step.
segment-cnv --cnv-file ~/ref/SKNMC_25k.CNV-profile.bedGraph --binsize 25000 \
              --ploidy 2 --output ~/ref/SKNMC_25k.CNV-seg.bedGraph --nproc 4

Installing bedGraphtoBigWig in EagleC environment (for plotting)

# Download the UCSC bedGraphToBigWig binary (linux.x86_64.v369) into ~/tools and make it executable.
cd ~/tools
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedGraphToBigWig
chmod a+x bedGraphToBigWig
#add to path
# NOTE(review): no command here actually adds ~/tools to PATH; this is still a manual step.

Installing HiSV in cooler_env https://github.com/GaoLabXDU/HiSV

# Install HiSV's Python dependencies into cooler_env, then clone HiSV.
conda activate cooler_env
# gcc from conda; presumably needed to compile prox_tv (see the comment below) — confirm.
conda install gcc
pip install numpy pysam pandas prox_tv
#prox_tv was annoying but installed from git source
# NOTE(review): the clone lands in whatever the current directory is; there is no cd before it.
git clone https://github.com/gaolabXDU/HiSV

Installing HiC_breakfinder in cooler_env https://github.com/dixonlab/hic_breakfinder

# hic_breakfinder build notes. This looks like a record of two approaches:
# (1) eigen and bamtools cloned from source, with jsoncpp from conda-forge;
# (2) bamtools and eigen installed from conda into cooler_env.
# NOTE(review): neither ./configure is preceded by a cd into hic_breakfinder, and no
# make / make install is recorded. Confirm the actual sequence before re-running.
cd ~/tools
git clone https://gitlab.com/libeigen/eigen.git #https://eigen.tuxfamily.org/dox/GettingStarted.html
git clone https://github.com/pezmaster31/bamtools.git #https://github.com/pezmaster31/bamtools/wiki/Building-and-installing
conda install -c conda-forge jsoncpp

git clone https://github.com/dixonlab/hic_breakfinder.git

# Configure against the source-tree copies of bamtools and eigen (absolute, user-specific paths).
./configure --prefix=/volumes/USR2/Ryan/tools/hic_breakfinder/ CPPFLAGS="-I /volumes/USR2/Ryan/tools/bamtools/include -I /volumes/USR2/Ryan/tools/eigen" LDFLAGS="-L/volumes/USR2/Ryan/tools/bamtools/lib/"

# NOTE(review): same repository as the clone above; it targets the same directory name.
git clone https://github.com/dixonlab/hic_breakfinder

 conda install -c bioconda bamtools
 conda install -c conda-forge eigen
# Configure against the include/lib directories of the cooler_env conda environment instead.
./configure CPPFLAGS="-I /volumes/USR2/Ryan/miniconda3/envs/cooler_env/include/bamtools -I /volumes/USR2/Ryan/miniconda3/envs/cooler_env/include/eigen3/" LDFLAGS="-L/volumes/USR2/Ryan/miniconda3/envs/cooler_env/lib/bamtools"

Samblaster for eccDNA

Installation of samblaster https://github.com/GregoryFaust/samblaster

# Download the samblaster master-branch source archive, unpack it, and build it.
cd ~/tools
wget https://github.com/GregoryFaust/samblaster/archive/refs/heads/master.zip
unzip master.zip
cd samblaster-master
# The archive contains source only; make produces the samblaster binary in this directory.
make

Circle-finder for eccDNA


# Fetch the Circle_finder bwa-mem/samblaster pipeline script, pinned to commit 3eb333d.
cd ~/tools
mkdir -p circle-finder
# Use the raw.githubusercontent.com URL: the github.com/.../blob/... URL returns the HTML
# page for the file rather than the script itself. -P saves it into circle-finder/.
wget -P circle-finder https://raw.githubusercontent.com/pk7zuva/Circle_finder/3eb333db2ea6277dde36cbf640be9afeb710c717/circle_finder-pipeline-bwa-mem-samblaster.sh

Separate Conda Env: scbs_env

Need a separate environment for scbs analysis because python version has to be lower.

Methylation Analysis with scbs

https://github.com/LKremer/scbs https://www.bioconductor.org/packages/release/bioc/vignettes/Melissa/inst/doc/process_files.html

Naming a scbs conda environment as scbs_env

# Create the scbs_env environment (python 3.9.15), install scbs with pip, and add R packages from conda-forge.
conda deactivate #leave the r4.2 env that ~/.bashrc activates
conda create -n "scbs_env" python=3.9.15
conda activate scbs_env
python3 -m pip install scbs
conda install -c conda-forge r-xml r-gert r-ragg r-spdep r-terra r-stringi
# Confirm the install; the version reported at the time is recorded on the next line.
scbs --version
#scbs, version 0.5.4

Install R packages for downstream analysis.


References Download

Downloading mm10 and hg38 reference genomes. Using the cellranger ones for consistency with commercial products.

# Download and unpack the 10x cellranger-arc GRCh38 and mm10 (2020-A-2.0.0) references into ~/ref.
# -p: ~/ref may already exist, because earlier sections of these notes also cd into it.
mkdir -p ~/ref
cd ~/ref
wget https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-GRCh38-2020-A-2.0.0.tar.gz
wget https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz

tar -xvf refdata-cellranger-arc-GRCh38-2020-A-2.0.0.tar.gz
tar -xvf refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz

Prepare Reference Genomes For Aligners

Skipping the mouse genome for now, but it's similar.

# Build the bwa index for the GRCh38 genome.fa; the trailing & runs it in the background.
bwa index ~/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa &

# Prepare the Bismark genome from the same fasta directory, also in the background.
bismark_genome_preparation ~/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta &

Installation of scDNA Replication tools https://github.com/shahcompbio/scdna_replication_tools/tree/main

# Clone scdna_replication_tools and install it in development mode.
git clone git@github.com:shahcompbio/scdna_replication_tools.git
# git clone creates "scdna_replication_tools"; the "-main" suffix only appears
# when unpacking a downloaded GitHub zip archive.
cd scdna_replication_tools
# Python 3.7.4 conda environment, with a venv created on top of it inside the clone.
conda create -n scdna_replication_tools python==3.7.4
conda activate scdna_replication_tools
python -m venv venv/
source venv/bin/activate
# numpy and cython are installed before the requirements file and setup.py are run.
pip install numpy cython
pip install -r requirements3.txt
python setup.py develop


import numpy
from Cython.Build import cythonize
from setuptools import setup, Extension

        Extension('package.cython_code1', sources=['package/cython_code1.pyx']),
        Extension('package.cython_code2', sources=['package/cython_code2.pyx']),

Installing Singularity, Nextflow and ONT Analysis Pipeline

Necessary tools for ONT validation of structural variant calls

#install singularity (conda)
conda install -c conda-forge singularity

#initialize epi2me
# NOTE(review): this calls ./nextflow from the current directory; the nextflow install
# steps are recorded further down in these notes.
./nextflow run epi2me-labs/wf-human-variation --help

# OUTPUT was never set in these notes. Default it so -w and --out_dir do not
# collapse to "/workspace" and a missing argument.
OUTPUT="${OUTPUT:-output}"

# Demo run of wf-human-variation (SNP + SV calling) on the bundled demo data.
# The original ended with a dangling line continuation; it is removed here.
nextflow run epi2me-labs/wf-human-variation \
    -w "${OUTPUT}/workspace" \
    -profile standard \
    --snp --sv \
    --bam demo_data/demo.bam \
    --bed demo_data/demo.bed \
    --ref demo_data/demo.fasta \
    --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \
    --sample_name MY_SAMPLE \
    --out_dir "${OUTPUT}" \
    -with-singularity


run on local terminal but use files and directories


#install dorado (prebuilt binary)
# -p: ~/src is already created at the top of these notes, and plain mkdir errors if it exists.
mkdir -p ~/src
cd ~/src
wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz
tar -xvf dorado-0.3.4-linux-x64.tar.gz
#add to path in ~/.bashrc
#download dorado models
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz

#install nextflow
cd ~/
# Install SDKMAN, load it into this shell, and use it to install Java 17.0.6-amzn.
curl -s "https://get.sdkman.io" | bash
source "$HOME/.sdkman/bin/sdkman-init.sh"
sdk install java 17.0.6-amzn
# Run the Nextflow installer script, then self-update the ./nextflow launcher.
curl -s https://get.nextflow.io | bash
./nextflow self-update
mv ~/nextflow ~/tools #move the launcher into ~/tools; NOTE(review): ~/tools itself is not added to PATH in the lines visible here

#install docker from https://www.docker.com/products/docker-desktop/ using apple chip
#run docker and it should be in the path
docker version
#connect to server both USR2 and seq
output_name="20230726_1239_2D_PAO38369_output" #change to each flowcell
pod5_dir="/Volumes/seq/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell

#connect through Finder to seq and USR2 (for reference genome.fa)

# ref (reference genome.fa) and wd_out (parent output directory) are never assigned in
# these notes. If they are empty, --ref would swallow the next flag and --out_dir would
# become "/<output_name>", so refuse to run until both are set.
if [[ -z "${ref:-}" || -z "${wd_out:-}" ]]; then
  printf '%s\n' "set ref (path to reference genome.fa) and wd_out (output parent dir) first" >&2
else
  # Basecall the pod5 files with dorado via wf-basecalling, emitting an aligned BAM.
  nextflow run epi2me-labs/wf-basecalling \
      --input "${pod5_dir}" \
      --dorado_ext "pod5" \
      --output_bam \
      --cuda-device cuda:all \
      --ref "${ref}" \
      --verbose \
      --out_dir "${wd_out}/${output_name}" \
      --basecaller_cfg "dna_r10.4.1_e8.2_400bps_hac@v4.2.0" \
      --remora_cfg "dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2"
fi

trying Higashi for scHiC

# Clone Higashi into ~/tools and install it with its setup.py into the currently active Python environment.
cd ~/tools 
git clone https://github.com/ma-compbio/Higashi/
cd Higashi
python setup.py install

# Log in to seadragon and request an interactive node.
# The -R resource string is quoted: unquoted, rusage[mem=16] is a glob pattern for the shell.
ssh seadragon
bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=1:gmem=4 -M 16 -R "rusage[mem=16]" /bin/bash #get interactive gpu node

bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R "rusage[mem=16]" /bin/bash #get interactive transfer node this has internet access for environment set up
# Load the miniconda module and initialise conda for this shell.
# NOTE(review): the module version (39_23.5.0) and the hook path (py39_4.12.0) differ — confirm this is intended.
module load miniconda3/39_23.5.0; eval "$(/risapps/rhel7/miniconda3/py39_4.12.0/bin/conda shell.bash hook)"
module load cuda11.5/toolkit/11.5.1

# Higashi / Fast-Higashi environment on seadragon.
# This is a running log: several installs are repeated with different pins, kept in the order they were tried.
conda create --name hic python=3.11 #new env named hic
#install higashi from 4dn
conda activate hic
conda install -c conda-forge mamba
mamba install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
mamba install -c bioconda pybedtools cooler
conda install -c bioconda pybedtools

mamba install -c conda-forge zlib

mamba install -c ruochiz fasthigashi
# CUDA 10.2 (version of cuda in modules)
# NOTE(review): the module loaded earlier in these notes is cuda11.5/toolkit/11.5.1 — confirm which cudatoolkit is wanted.
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch

# -p so this does not error when ~/tools already exists.
mkdir -p ~/tools
cd ~/tools
git clone https://github.com/ma-compbio/Higashi/
cd Higashi
conda install python==3.9 numpy==1.16.5
python setup.py install

# The original had these three commands fused onto one line, with an en-dash instead
# of "--" and a stray trailing "q"; they are split and corrected here.
conda create --name hic
conda activate hic
mamba install -c ruochiz fasthigashi

wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz

Download data via transfer node

# Quote the -R resource string so the shell does not treat [mem=16] as a glob.
bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R "rusage[mem=16]" /bin/bash #get interactive transfer node this has internet access for environment set up

# NOTE(review): every rsync source below is just "mulqueen@" with no host or path
# (apparently stripped from these notes). Fill in user@host:/path before running.

#ONT data
rsync  \
-LPr mulqueen@ ~/projects/gccACT

#dorado prebuilt
rsync \
-LPr mulqueen@ ~/tools
#dorado reference genome
#download model for base calling
#wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz #cpg??
#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0
rsync \
-LPr mulqueen@ ~/
rsync \
-LPr mulqueen@ ~/

#nextflow epi2me download
rsync \
-LPr mulqueen@ ~/

#my references
rsync \
-LPr mulqueen@ ~/

Use CUDA node

# Request an interactive GPU node (2 GPUs). The -R string is quoted so [mem=16] is not globbed.
bsub -Is -W 6:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 16 -R "rusage[mem=16]" /bin/bash #get interactive gpu node

module load singularity/3.7.0
module load nextflow/23.04.3
module load cuda11.5/toolkit/11.5.1
module load samtools/1.15
#module load dorado
#module load wf-human-variation

# OUTPUT is not set anywhere in these notes; default it so the paths below are well-formed.
OUTPUT="${OUTPUT:-output}"

# Demo run from the local wf-human-variation-master checkout, using singularity rather than docker.
nextflow run ./wf-human-variation-master/main.nf \
    -w "${OUTPUT}/workspace" \
    --snp --sv --bam demo_data/demo.bam \
    --bed demo_data/demo.bed \
    --ref demo_data/demo.fasta \
    --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \
    --sample_name MY_SAMPLE \
    --out_dir "${OUTPUT}" \
    -with-singularity -without-docker

# If you want to start with a FASTQ, you can generate an unaligned BAM from FASTQ with samtools import http://www.htslib.org/doc/samtools-import.html and allow the workflow to take care of mapping.

#CPU Node Run
# The -R string is quoted so the shell does not treat [mem=16] as a glob.
bsub -Is -W 6:00 -q medium -n 1 -M 16 -R "rusage[mem=16]" /bin/bash #get interactive cpu node

#module load singularity/3.7.0
#module load nextflow/23.04.3
#module load cuda11.5/toolkit/11.5.1
module load samtools/1.15
#module load dorado

output_name="20230726_1239_2D_PAO38369_output" #change to each flowcell
pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell

# ref (reference genome.fa) and wd_out (output parent dir) are never assigned in these
# notes; with them empty the commands below would receive shifted or rooted arguments.
if [[ -z "${ref:-}" || -z "${wd_out:-}" ]]; then
  printf '%s\n' "set ref (path to reference genome.fa) and wd_out (output parent dir) first" >&2
else
  # CPU basecalling smoke test limited to 100 reads.
  # NOTE(review): no output redirect is recorded here; presumably the SAM goes to stdout — confirm.
  # The original ended with a dangling line continuation; it is removed here.
  dorado basecaller \
      --verbose \
      --device cpu \
      --reference "${ref}" \
      --emit-sam \
      --max-reads 100 \
      --batchsize 64 \
      'dna_r10.4.1_e8.2_400bps_hac@v4.2.0' \
      "${pod5_dir}/pod5_pass/"

  #untested for this part, but use bam from wf-basecalling output as input
  # NOTE(review): --basecaller_cfg is a hac model while --remora_cfg names a sup model — confirm the pairing.
  nextflow run ~/wf-human-variation-master/main.nf \
      -w "${wd_out}/${output_name}/workspace" \
      -profile singularity \
      --snp --sv --cnv --methyl \
      --ref "${ref}" \
      --bam "${wd_out}/${output_name}.bam" \
      --dorado_ext pod5 \
      --basecaller_basemod_threads 40 \
      --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.2.0' \
      --remora_cfg 'dna_r10.4.1_e8.2_400bps_sup@v4.2.0_5mCG_5hmCG@v2' \
      --sample_name "${output_name}" \
      --out_dir "${wd_out}/${output_name}/" \
      -with-singularity
fi

# Work on host r1prpsciapp13 with a clean module set.
ssh r1prpsciapp13
module purge
module load nextflow/23.04.3
module load singularity/3.7.0

#manually pull singularity image?
nextflow pull epi2me-labs/wf-human-variation
# The "> /dev/null" redirect sat on its own line in the original, where it applied to
# nothing; it is attached to the pull command here.
singularity pull --name ontresearch-wf-human-variation-shac4db03c19b6ff1277a24ec28a19e564d628d478f.img.pulling.1669977561040 docker://ontresearch/wf-human-variation:shac4db03c19b6ff1277a24ec28a19e564d628d478f \
  > /dev/null

# OUTPUT is not set anywhere in these notes; default it so the paths below are well-formed.
OUTPUT="${OUTPUT:-output}"

# NOTE(review): the next line runs the workflow with no parameters; it looks like a
# partial paste of the full command that follows.
nextflow run ~/wf-human-variation-master/main.nf
# sif_in is not defined in these notes; the ${sif_in:+...} form passes it only when it is set.
nextflow run ~/wf-human-variation-master/main.nf \
     -w "${OUTPUT}/workspace" \
     -profile standard \
     --snp --sv --bam demo_data/demo.bam \
     --bed demo_data/demo.bed \
     --ref demo_data/demo.fasta \
     --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \
     --sample_name MY_SAMPLE --out_dir "${OUTPUT}" \
     -without-docker -with-singularity ${sif_in:+"$sif_in"}

# Unpack the site-provided demo data and run the workflow from the local checkout.
tar -xzf /risapps/tps_source/tps_source/wf-human-variation/demo_data.tar.gz
nextflow run wf-human-variation-master -w "${OUTPUT}/workspace" -profile standard --snp --sv --bam demo_data/demo.bam --bed demo_data/demo.bed --ref demo_data/demo.fasta --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' --sample_name MY_SAMPLE --out_dir "${OUTPUT}" -with-singularity -without-docker

# Straight quotes: the original typographic quotes would have become part of the queue name.
# NOTE(review): if dorado_test.bsub relies on #BSUB directives, it may need to be
# submitted as `bsub < dorado_test.bsub` — confirm against the site's LSF setup.
bsub -q "gpu-medium" dorado_test.bsub