Note: This processing is exploratory and not the final processing used for the manuscript
This notebook describes processing of s3GCC samples following the splitting of a deduplicated bwa mem aligned bam file which occurs in “s3 Preprocessing”
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data")
#complexity data
crc_compl<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/complexity_data/CRC-4442CRC-4671_GCC_H.complexity.txt",header=F)
colnames(crc_compl)<-c("row_no","cellID","total_reads","uniq_reads","perc_uniq")
cellline_compl<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/complexity_data/CellLine_GCC_L.complexity.txt",header=F)
colnames(cellline_compl)<-c("row_no","cellID","total_reads","uniq_reads","perc_uniq")
compl<-rbind(crc_compl,cellline_compl)
#read in indexes master file
idx<-read.table("/home/groups/oroaklab/src/scitools/scitools-dev/SCI_stdchem_Indexes.txt",col.names=c("idx_name","idx_cycles","idx_seq"))
idx_i7<-idx[idx$idx_cycles==1,]
colnames(idx_i7)<-c("i7_idx_name","i7_idx_cycle","i7_idx_seq")
idx_i5<-idx[idx$idx_cycles==2,]
colnames(idx_i5)<-c("i5_idx_name","i5_idx_cycle","i5_idx_seq")
idx_tn5<-idx[idx$idx_cycles==3,]
colnames(idx_tn5)<-c("tn5_idx_name","tn5_idx_cycle","tn5_idx_seq")
#Per Plate breakdown of data contained as sheets in https://docs.google.com/spreadsheets/d/1mZ34KIwmr2vdjQlnqY7v_u0-Eca8mRc-HgI2r_WICXk/edit#gid=1305239528
#Not all plates are a proper barnyard mix.
crc_gcc_pcr_annot<-read.table("S3GCC_crc_pcr_annotation.txt",sep="\t",header=T)
celline_gcc_tn5_annot<-read.table("S3WGSandGCC_cellline_tn5_annotation.txt",sep="\t",header=T)
cellline_cellID<-read.table("s3gcc_cellline.bbrd.q10.filt.cellIDs.list",header=F)
colnames(cellline_cellID)<-c("cellID")
crc_cellID<-read.table("s3gcc_crc.bbrd.q10.filt.cellIDs.list",header=F)
colnames(crc_cellID)<-c("cellID")
cellID<-rbind(crc_cellID,cellline_cellID)
cellID$i7_idx_seq<-substr(cellID$cellID,0,8)
cellID$tn5_idx_seq<-substr(cellID$cellID,17,24)
cellID$i5_idx_seq<-substr(cellID$cellID,9,16)
dat<-merge(cellID,idx_i7,by="i7_idx_seq")
dat<-merge(dat,idx_tn5,by="tn5_idx_seq")
dat<-merge(dat,idx_i5,by="i5_idx_seq")
dat_cellline<-merge(dat,celline_gcc_tn5_annot,by="tn5_idx_seq")
dat_crc<-merge(dat,crc_gcc_pcr_annot,by=c("i5_idx_seq","i7_idx_seq","i5_idx_cycle","i7_idx_cycle","i5_idx_name","i7_idx_name"))
dat_cellline<-dat_cellline[c("cellID","i7_idx_seq","i5_idx_seq","tn5_idx_seq","i7_idx_name","i5_idx_name","tn5_idx_name","sample")]
dat_crc<-dat_crc[c("cellID","i7_idx_seq","i5_idx_seq","tn5_idx_seq","i7_idx_name","i5_idx_name","tn5_idx_name","sample")]
dat<-rbind(dat_cellline,dat_crc)
dat$plate<-unlist(lapply(strsplit(dat$i5_idx_name,"_"),"[",2))
dat<-merge(dat,compl,by="cellID")
write.table(dat,file="s3gcc_cellsummary.tsv",col.names=T,sep="\t")
library(ggplot2)
plt<-ggplot(dat,aes(x=sample,y=log10(uniq_reads),color=as.factor(sample)))+geom_boxplot()+geom_jitter()
ggsave(plt,file="s3gcc_uniqreads.png")
#Extract distal reads based on >=1kb
#or transchromosomal on different chromosomes
#printing out distal connections (>=1kb or different chromosomes) using awk
#Note this is for read projection but they are not used for the actual s3GCC analysis
dir = "/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/raw_alignment"
for i in CellLine_GCC_L.nsrt.bam CRC-4442CRC-4671_GCC_H.nsrt.bam;
do \
out_name=${i::-9};
echo $out_name;
#print out reads on same chromosome that are >= 1000 bp apart (distal)
((samtools view -H $i) & (samtools view $i | awk '{if (sqrt(($9^2))>=1000) print $0}')) | samtools view -bS - > $out_name.distal.bam &
#print out reads on different chromosomes
((samtools view -H $i) & (samtools view $i | awk '{if($7 != "=") print $0}')) | samtools view -bS - > $out_name.transchr.bam &
#print out reads on same chromosome within 1000bp (cis)
((samtools view -H $i) & (samtools view $i | awk '{if (sqrt(($9^2))<=1000) print $0}')) | samtools view -bS - > $out_name.cis.bam &
done &
#Processing of splitting bam files is similar to how wgs cells were treated in s3wgs notebook.
#moved files to s3gcc_data
mv CRC-4442CRC-4671_GCC_H.cis.bam CRC-4442CRC-4671_GCC_H.distal.bam CRC-4442CRC-4671_GCC_H.transchr.bam ../s3gcc_data/
#filter bam files to just cellIDs which pass QC
DIR="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis"
scitools bam-filter -L $DIR/s3gcc_data/s3gcc_crc.bbrd.q10.filt.cellIDs.list -O $DIR/s3gcc_data/s3gcc_crc_H.cis $DIR/s3gcc_data/CRC-4442CRC-4671_GCC_H.cis.bam &
scitools bam-filter -L $DIR/s3gcc_data/s3gcc_crc.bbrd.q10.filt.cellIDs.list -O $DIR/s3gcc_data/s3gcc_crc_H.distal $DIR/s3gcc_data/CRC-4442CRC-4671_GCC_H.distal.bam &
scitools bam-filter -L $DIR/s3gcc_data/s3gcc_crc.bbrd.q10.filt.cellIDs.list -O $DIR/s3gcc_data/s3gcc_crc_H.trans $DIR/s3gcc_data/CRC-4442CRC-4671_GCC_H.transchr.bam &
#Split bams by cell line
for i in s3gcc_crc_H.cis.filt.bam s3gcc_crc_H.distal.filt.bam s3gcc_crc_H.trans.filt.bam;
do scitools bam-split -A sample.annot $i & done
#ensure bam files are properly mate paired
for i in s3gcc_crc_H.cis.filt.crc4442.bam s3gcc_crc_H.distal.filt.crc4442.bam \
s3gcc_crc_H.distal.filt.crc4671.bam s3gcc_crc_H.trans.filt.crc4442.bam \
s3gcc_crc_H.cis.filt.crc4671.bam s3gcc_crc_H.trans.filt.crc4671.bam ;
do samtools sort -@ 10 -n $i | samtools fixmate - - | samtools view -@ 10 -b -f 2 -F 524 - > ${i::-4}.matefixed.bam ; done &
#perform read projection
for i in s3gcc_crc_H.cis.filt.crc4442.matefixed.bam s3gcc_crc_H.cis.filt.crc4671.matefixed.bam s3gcc_crc_H.distal.filt.crc4442.matefixed.bam s3gcc_crc_H.distal.filt.crc4671.matefixed.bam s3gcc_crc_H.trans.filt.crc4442.matefixed.bam s3gcc_crc_H.trans.filt.crc4671.matefixed.bam;
do scitools bam-project -r 500 -n 1 -X -e $i ; done &
#Collate projections for plotting
cd /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data
#read projections
#distal
for i in s3gcc_crc_H.cis.filt.crc4442.matefixed.read_projections s3gcc_crc_H.cis.filt.crc4671.matefixed.read_projections s3gcc_crc_H.distal.filt.crc4442.matefixed.read_projections s3gcc_crc_H.trans.filt.crc4442.matefixed.read_projections s3gcc_crc_H.trans.filt.crc4671.matefixed.read_projections
do line=${i:21:-27};
mate_type=${i:12:-40};
name=$line"_"$mate_type
awk -v name=$name 'OFS="\t" {print $1,$2,$3,name}' $i/cell_summaries.txt;
done >> ./s3gcc_projected_reads.txt
library(ggplot2)
library(dplyr)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data")
proj<-read.table("s3gcc_projected_reads.txt",header=F)
colnames(proj)<-c("proj_perc_uniq","proj_reads","proj_unique_reads","sample_read_type")
dat<- as.data.frame(proj %>%
group_by(sample_read_type) %>%
summarize(mean=mean(log10(proj_unique_reads/2)),
sd=sd(log10(proj_unique_reads/2)),
median=median(log10(proj_unique_reads/2))))
dat[dat$proj_perc_uniq==0.05,]
# sample assay proj_perc_uniq read_type mean sd median
#1 crc4442 s3gcc 0.05 cis_distal 3.199874 0.4360882 3.131619
#2 crc4442 s3gcc 0.05 cis_proximal 5.443474 0.3253194 5.345948
#3 crc4442 s3gcc 0.05 trans 3.238635 0.2888201 3.170702
#284 crc4671 s3gcc 0.05 cis_distal 3.492634 0.3608255 3.511349
#285 crc4671 s3gcc 0.05 cis_proximal 5.649266 0.3503700 5.614836
#286 crc4671 s3gcc 0.05 trans 3.280166 0.3216718 3.289254
ggplot(dat,aes(x=as.numeric(proj_perc_uniq),fill = paste(sample,assay,read_type),color=paste(sample,assay,read_type)))+
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd),alpha=0.1,color=NA) +
geom_line(aes(y=mean,linetype="solid"))+
geom_line(aes(y=median,linetype="dashed")) +
theme_bw() + scale_x_reverse()+ylim(c(0,6.5))
ggsave(file="gcc_projected_contacts.png")
ggsave(file="gcc_projected_contacts.pdf")
system("slack -F gcc_projected_contacts.png ryan_todo")
system("slack -F gcc_projected_contacts.pdf ryan_todo")
#For splitting to single cell bam files (NOT RUN)
#add read groups to each bam file
for i in s3gcc_crc_H.trans.filt.bam s3gcc_crc_H.distal.filt.bam s3gcc_crc_H.cis.filt.bam; do scitools bam-addrg $i & done
#cis is still running
#cellID is few enough that there are no IO errors, split by RG
for i in s3gcc_crc_H.trans.filt.RG.bam s3gcc_crc_H.distal.filt.RG.bam s3gcc_crc_H.cis.filt.RG.bam; do samtools split -@10 -f './single_cell_projections/%*_%!.%.' $i ; done & #perform samtools split on RG (cellIDs)
#perform parallelized duplicate removal
find . -type f -name '*matefixed.bam' | parallel -j 20 scitools bam-rmdup {}
#perform parallelized read projection
find . -type f -name '*matefixed.bam' | parallel -j 20 scitools bam-project -r 500 -n 1 -X -e {}
cd /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data
#read projections
#distal
for i in ./single_cell_projections/projections/*distal*matefixed.read_projections;
do cellid=${i:38:-17};
awk -v cellid=$cellid 'OFS="\t" {print $1,$2,$3,cellid,"cis_distal"}' $i/cell_summaries.txt;
done > ./s3gcc_distal_projected_reads.txt
#cis
for i in ./single_cell_projections/projections/*cis*matefixed.read_projections;
do cellid=${i:38:-17};
awk -v cellid=$cellid 'OFS="\t" {print $1,$2,$3,cellid,"cis_proximal"}' $i/cell_summaries.txt;
done > ./s3gcc_cis_projected_reads.txt
#trans
for i in ./single_cell_projections/projections/*trans*matefixed.read_projections;
do cellid=${i:38:-17};
awk -v cellid=$cellid 'OFS="\t" {print $1,$2,$3,cellid,"trans"}' $i/cell_summaries.txt;
done > ./s3gcc_trans_projected_reads.txt
import os
wd="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/"
sc_dir="gcc_cisdistal_triplesparse/"
if not os.path.exists(wd+sc_dir):
os.makedirs(wd+sc_dir)
print("Directory created or already present.")
Triple-sparse format will be the input for scHiCluster. This script will take in a bam file, split by unique identifiers in the read name, and then move to the new working directory. Output will be a triple-sparse format file for each cell id, for each chromosome. It will also make the network file, which lists full path to each cell ID (needed for scHiCluster).
Currently works at a 1mb resolution.
import glob
import pysam
from collections import defaultdict
from collections import Counter
import pandas as pd
import os
import time
def read_pair_generator(bam):
"""
Generate read pairs in a BAM file or within a region string.
Reads are added to read_dict until a pair is found.
"""
read_dict = defaultdict(lambda: [None, None])
for read in bam.fetch(until_eof=True):
qname = read.query_name
if qname not in read_dict:
if read.is_read1:
read_dict[qname][0] = read
else:
read_dict[qname][1] = read
else:
if read.is_read1:
yield read, read_dict[qname][1]
else:
yield read_dict[qname][0], read
del read_dict[qname]
bam_dir="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/filtered_bam/" #working directory
bam_bulk=["CRC-4442CRC-4671_GCC_H.bbrd.q10.filt.bam"] #list of bam files
#Takes read 1 and read 2 paired by query name with the above function
#then rounds read 1 and read 2 starts to the nearest 1mb and takes that as the bin (drops the 6 0s for the bin names)
out_dict = defaultdict(dict)
chr_count=[]
i=0 #Counter just to check progress
start_time = time.time() #And set up a timer
#cis interactions first
for bulk in bam_bulk:
cellline=bulk.split(".")[0]
bam = pysam.AlignmentFile(bam_dir+bulk, 'rb')
for read1, read2 in read_pair_generator(bam): #set paired reads together
if read1 is not None and read2 is not None: #remove reads which dont pair
if read1.reference_id==read2.reference_id: #ensure chromosomes are same
if abs(read1.template_length)>=50000: #ignore reads <50kbp in length
i+=1
if i % 100000 == 0:
print("Processed "+str(i)+" distal cis reads in "+str(time.time()-start_time)+" seconds.")
if read1.reference_start < 1000000: #bin for 0 (less than 1mbp into chr)
r1_bin="0"
else:
r1_bin=str(read1.reference_start)[:-6] #bin for anything greater than 1mb, just cut last 6 digits off
if read2.reference_start < 1000000:
r2_bin="0"
else:
r2_bin=str(read2.reference_start)[:-6]
if int(r2_bin) < int(r1_bin): #quick sorting of bins
tmp=r2_bin
r2_bin=r1_bin
r1_bin=tmp
chr_out=bam.get_reference_name(read1.reference_id) #set chr name
chr_count.append(chr_out) #make sure chr name was seen before or added
cellid_out=read1.query_name.split(":")[0] #assign read to cellID
cellid_out=cellline+"_"+cellid_out
if cellid_out not in out_dict: #add any new cellIDs
out_dict[cellid_out][chr_out]=[[r1_bin,r2_bin]] #build out dict
if chr_out not in out_dict[cellid_out]:
out_dict[cellid_out][chr_out]=[[r1_bin,r2_bin]]
else:
out_dict[cellid_out][chr_out].append([r1_bin,r2_bin])
chr_count=len(Counter(chr_count))
wd="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/" #working directory
sc_dir="gcc_cisdistal_triplesparse/"
with open(wd+"scHiCluster.network","w") as fout:
for cellid_out in out_dict: #generate sciHiCluster.network file (list of path/cellname)
fout.write(wd+sc_dir+cellid_out+"\n")
for cellid_out in out_dict: #generate [cellID].[chr].txt files in triple sparse format
for chr_out in out_dict[cellid_out]:
with open(wd+sc_dir+cellid_out+"_"+chr_out+".txt","w") as fout:
dict = Counter([tuple(i) for i in out_dict[cellid_out][chr_out]])
# Creating pandas dataframe
Output = pd.DataFrame(data ={'bin1': list([item[0] for item in list(dict.keys())]),'bin2': list([item[1] for item in list(dict.keys())]),'count': list(dict.values())})
Output.to_csv(fout,sep="\t",header=False,index=False)
import glob
import pysam
from collections import defaultdict
from collections import Counter
import pandas as pd
import os
import time
def read_pair_generator(bam):
"""
Generate read pairs in a BAM file or within a region string.
Reads are added to read_dict until a pair is found.
"""
read_dict = defaultdict(lambda: [None, None])
for read in bam.fetch(until_eof=True):
qname = read.query_name
if qname not in read_dict:
if read.is_read1:
read_dict[qname][0] = read
else:
read_dict[qname][1] = read
else:
if read.is_read1:
yield read, read_dict[qname][1]
else:
yield read_dict[qname][0], read
del read_dict[qname]
bam_dir="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/" #working directory
bam_bulk=["s3gcc_crc.bbrd.q10.filt.bam","s3gcc_cellline.bbrd.q10.filt.bam"] #list of bam files
#Takes read 1 and read 2 paired by query name with the above function
#then rounds read 1 and read 2 starts to the nearest 1mb and takes that as the bin (drops the 6 0s for the bin names)
distal_count={}
trans_count={}
local_count={}
#cis distal and trans interactions counting
for bulk in bam_bulk:
cellline=bulk.split(".")[0]
bam = pysam.AlignmentFile(bam_dir+bulk, 'rb')
for read1, read2 in read_pair_generator(bam): #set paired reads together
if read1 is not None and read2 is not None: #remove reads which dont pair
cellid_out=read1.query_name.split(":")[0]
if read1.reference_id==read2.reference_id: #ensure chromosomes are same
if abs(read1.template_length)>=1000: #ignore reads <1kbp in length
if cellid_out in distal_count:
distal_count[cellid_out]+=1
else:
distal_count[cellid_out]=1
else:
if cellid_out in local_count:
local_count[cellid_out]+=1
else:
local_count[cellid_out]=1
if read1.reference_id!=read2.reference_id: #ensure chromosomes are same
if cellid_out in trans_count:
trans_count[cellid_out]+=1
else:
trans_count[cellid_out]=1
distal_df=pd.DataFrame.from_dict(distal_count,orient="index",columns=["distal_cis_interactions"])
distal_df.index.name = 'cellID'
distal_df.reset_index(inplace=True)
trans_df=pd.DataFrame.from_dict(trans_count,orient="index",columns=["trans_interactions"])
trans_df.index.name = 'cellID'
trans_df.reset_index(inplace=True)
local_df=pd.DataFrame.from_dict(local_count,orient="index",columns=["local_cis"])
local_df.index.name = 'cellID'
local_df.reset_index(inplace=True)
df = pd.merge(left=distal_df, right=trans_df, left_on='cellID', right_on='cellID')
df = pd.merge(left=df, right=local_df, left_on='cellID', right_on='cellID')
df.to_csv(bam_dir+'s3gcc_hiccontacts.csv', index=False)
This script will run scHiCluster, with a harded length of hg38 chromosomes on the files produced from the script above.
It is currently set to generate 3 clusters (via K Means clustering). sciHiCluster source and example processing is available here.
import time
import numpy as np
from schicluster import *
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics.cluster import adjusted_rand_score as ARI
import csv
import glob
import pandas as pd
wd="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/" #working directory
sc_dir="gcc_cisdistal_triplesparse/"
network_file=wd+"scHiCluster.network"
network = open(network_file).read().splitlines()
network = np.loadtxt(network_file,dtype=np.str)
#sizes (in bp) of autosomes in numerical order taken from UCSC hg38
hg38dim=[248956422,242193529,198295559,190214555,181538259,170805979,159345973,145138636,138394717,133797422,135086622,133275309,114364328,107043718,101991189,90338345,83257441,80373285,58617616,64444167,46709983,50818468] #this is all autosomes, but I'm just using chr1-16 for now (others too sparse so it throws errors)
chrom = [str(i+1) for i in range(22)] #only using autosomes 1-22 due to sparsity
chromsize = {chrom[i]:hg38dim[i] for i in range(len(chrom))} #generate dicitonary of chromosome names and sizes
nc=5 #5 clusters?
chr_count = []
chr_names=["chr"+s for s in chrom]
#need to filter cells with less than a certain number of reads per chr
#building a pandas data frame for count per chr
for cellID in network:
cellID_out=cellID.split("/")[-1]
for file in glob.glob(cellID+"*"+".txt"):
chr_out=file.split("/")[-1].split("_")[-1].split(".")[0]
num_lines = sum(1 for line in open(file))
chr_count.append([cellID_out,chr_out,num_lines])
chr_count=pd.DataFrame(chr_count,columns=["cellID","chr_out","contact_count"])
filter_list=[]
for i in chr_count.cellID.unique():
tmp=chr_count.loc[chr_count['cellID'] == i]
tmp=tmp[tmp["chr_out"].isin(chr_names)]
if len(tmp)==len(chr_names) and tmp.contact_count.min()>1:
filter_list.append(i)
#filter network to cells with atleast 2 reads per chr
#goal here is to group sparse cells for more thorough analysis
network_fil=["/".join(network[0].split("/")[0:-1])+"/"+i for i in filter_list] #filtering the network list to those that pass the count filter
len(network_fil)
label=["_".join(i.split("/")[-1].split("_")[0:2]) for i in network_fil]
start_time = time.time()
cluster,embedding=hicluster_cpu(network_fil,chromsize,nc=nc, pad=1, rp=0.5, prct=20, ncpus=20)
print(time.time() - start_time)
[ARI(label, KMeans(n_clusters = nc, n_init = 200).fit(embedding[:, :ndim]).labels_) for ndim in [2,5,10,20,30,40,50]]
out_dir=wd
#output annoted clusters
with open(wd+"scHiCluster.annot","w") as fout:
for i in range(0,len(cluster)-1):
fout.write(filter_list[i]+"\t"+str(cluster[i])+"\n")
print(filter_list[i]+"\t"+str(cluster[i]))
#output embeddings
embed=pd.DataFrame(embedding,index=filter_list)
with open(wd+"scHiCluster.dims","w") as fout:
embed.to_csv(fout,sep="\t",header=True,index=True)
Next we will load in the output PCA dims file from sciHiCluster and use UMAP on a subset of PCs which define the most variance. For this, I’m just going to use an elbow plot to define the cutoff.
library(ggplot2)
library(gridExtra)
library(Rphenograph)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data")
dat<-read.table("scHiCluster.dims",header=T,row.names=1)
scHiClus<-read.table("scHiCluster.annot",header=F)
colnames(scHiClus)<-c("fullname","scHiClus")
dat_var<-data.frame(pc=seq(1,ncol(dat)),perc_var=(sapply(dat, var)/sum(sapply(dat,var)))*100)
dat_var$cum_perc<-0
dat_var$cum_perc<-unlist(lapply(1:nrow(dat_var),function(x) sum(dat_var[seq(1,x),]$perc_var)))
plt<-ggplot(dat_var,aes(x=pc,y=perc_var))+geom_line()+theme_minimal()+geom_vline(aes(xintercept =7,color="red"))
ggsave(plt,file="scHiCluster_pc_varexplained.pdf")
system("slack -F scHiCluster_pc_varexplained.pdf ryan_todo")
annot<-read.table("scHiCluster.annot",header=F)
colnames(annot)<-c("cellID","kmean_clus")#can also just dbscan or pgclus it myself
annot_samp<-read.table("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3wgs_data/s3wgs_gcc.cellsummary.txt",header=T)
umap_dat<-function(x){
dat_dims<-as.data.frame(uwot::umap(dat[1:x]))
row.names(dat_dims)<-row.names(dat)
dat_dims$fullname<-row.names(dat_dims)
dat_dims$cellID_idx<-unlist(lapply(strsplit(dat_dims$fullname,"_"),"[",4))
dat_dims<-merge(dat_dims,annot_samp,by="cellID_idx")
plt<-ggplot(dat=dat_dims,aes(x=V1,y=V2,color=as.factor(sample),size=1))+geom_point()+theme_bw()+ggtitle(paste("UMAP of",x,"PCs"))
return(plt)
}
pc_dat<-function(x){
dat_dims<-as.data.frame(uwot::umap(dat[c(x,x+1)]))
row.names(dat_dims)<-row.names(dat)
dat_dims$fullname<-row.names(dat_dims)
dat_dims$cellID_idx<-unlist(lapply(strsplit(dat_dims$fullname,"_"),"[",4))
dat_dims<-merge(dat_dims,annot_samp,by="cellID_idx")
plt<-ggplot(dat=dat_dims,aes(x=V1,y=V2,color=as.factor(sample),size=1))+geom_point()+theme_bw()+ggtitle(paste("PC",x,"and",x+1))
return(plt)
}
plt_list<-lapply(c(3,4,5,6,7,8,9,10,20,30,50,75,100,ncol(dat)),umap_dat)
ncol=4
out_plt<-do.call("grid.arrange", c(plt_list, ncol=ncol))
ggsave(out_plt,file="scHiCluster_umap_pctest.svg",height=20,width=40,limitsize=FALSE)
ggsave(out_plt,file="scHiCluster_umap_pctest.png",height=20,width=40,limitsize=FALSE)
system("slack -F scHiCluster_umap_pctest.png ryan_todo")
plt_list<-lapply(c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15),pc_dat)
ncol=4
out_plt<-do.call("grid.arrange", c(plt_list, ncol=ncol))
ggsave(out_plt,file="scHiCluster_pca_pctest.svg",height=30,width=40,limitsize=FALSE)
ggsave(out_plt,file="scHiCluster_pca_pctest.png",height=30,width=40,limitsize=FALSE)
system("slack -F scHiCluster_pca_pctest.png ryan_todo")
#selecting x PCs to use
x=10
kmeans_out <- as.data.frame(kmeans(dat[1:x], center = 5)$cluster)
kmeans_out$fullname<-row.names(kmeans_out)
dat_dims<-as.data.frame(uwot::umap(dat[1:x]))
row.names(dat_dims)<-row.names(dat)
dat_dims$fullname<-row.names(dat_dims)
dat_dims$cellID_idx<-unlist(lapply(strsplit(dat_dims$fullname,"_"),"[",4))
dat_dims<-merge(dat_dims,kmeans_out,by="fullname")
colnames(dat_dims)[ncol(dat_dims)]<-"kmeanscluster"
dat_dims<-merge(dat_dims,scHiClus,by="fullname")
dat_dims<-merge(dat_dims,annot_samp,by="cellID_idx")
dat_dims$mapq20<-as.numeric(dat_dims$mapq20)
plt1<-ggplot(dat=dat_dims,aes(x=V1,y=V2,color=log10(mapq20)))+geom_point()+theme_bw()+ggtitle("Read Depth")
plt2<-ggplot(dat=dat_dims,aes(x=V1,y=V2,color=as.factor(kmeanscluster)))+geom_point()+theme_bw()+ggtitle(paste("PG Cluster on",x,"PCs"))
plt3<-ggplot(dat=dat_dims,aes(x=V1,y=V2,color=as.factor(scHiClus)))+geom_point()+theme_bw()+ggtitle(paste("scHiClusters on",x,"PCs"))
plt4<-ggplot(dat=dat_dims,aes(x=V1,y=V2,color=as.factor(sample)))+geom_point()+theme_bw()+ggtitle("Cell Line")
ncol=2
out_plt<-grid.arrange(plt1,plt2,plt3,plt4, ncol=ncol)
ggsave(out_plt,file="scHiCluster_umap.pdf",width=10)
ggsave(out_plt,file="scHiCluster_umap.png",width=10)
system("slack -F scHiCluster_umap.pdf ryan_todo")
write.table(dat_dims,file="s3gcc_fullsummary.txt",sep="\t",quote=F,col.names=T,row.names=F)
for (i in unique(dat_dims$scHiClus)){
out_dir="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/gcc_cisdistal_triplesparse/"
scHiclus_net_temp<-dat_dims[dat_dims$scHiClus==i,]
scHiclus_net_temp<-as.data.frame(cbind(scHiclus_net_temp$fullname))
scHiclus_net_temp$V1<-paste0(out_dir,scHiclus_net_temp$V1)
write.table(scHiclus_net_temp,file=paste0("network_",i,".txt"),col.names=F,row.names=F,quote=F)
}
import time
import numpy as np
from schicluster import *
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics.cluster import adjusted_rand_score as ARI
import csv
import glob
import pandas as pd
network_file_list=glob.glob("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/for_gurkan/network*txt")
chromsize_file="/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/chromLengths.txt"
hg38dim=[248956422,242193529,198295559,190214555,181538259,170805979,159345973,145138636,138394717,133797422,135086622,133275309,114364328,107043718,101991189,90338345,83257441,80373285,58617616,64444167,46709983,50818468] #this is all autosomes, but I'm just using chr1-16 for now (others too sparse so it throws errors)
chrom = [str(i+1) for i in range(16)]
chromsize = {chrom[i]:hg38dim[i] for i in range(len(chrom))}
chr_count = []
chr_names=["chr"+s for s in chrom]
res=1000000
#this had to be corrected to work for CPU
#also added a loop for single cell topdom
def merge_cpu(network, c, res, pad=1, rp=0.5, prct=-1,chromsize=chromsize):
ngene = int(chromsize[c] / res) + 1
start_time = time.time()
Q_sum = np.zeros(ngene * ngene)
for cell in network:
Q_sum = Q_sum + impute_cpu([cell, c, ngene, pad, rp])[1]
end_time = time.time()
print('Load and impute chromosome', c, 'take', end_time - start_time, 'seconds')
Q_sum = Q_sum.reshape(ngene, ngene)
return Q_sum
#this also had to be fixed with chromsize as variable
def output_topdom(cell, c, Q, res, chromsize=chromsize):
ngene, _ = Q.shape
B = [['chr' + c, i * res, (i + 1) * res] for i in range(ngene)]
B[-1][-1] = chromsize[c]
C = np.concatenate((B, Q), axis=1)
np.savetxt(cell + '_chr' + c + '.topdommatrix', C, fmt = '%s', delimiter = '\t')
return
for network_file in network_file_list:
network = open(network_file).read().splitlines()
network = np.loadtxt(network_file,dtype=np.str)
cell=network_file.split(".")[0]
print("Processing:"+cell)
for c in chromsize: #for whole network merged file
Q = merge_cpu(network, c, res=1000000)
output_topdom(cell, c, Q, res)
output_sparse(cell, c, Q, res)
for network_file in network_file_list:
print(network_file)
for cell in open(network_file).read().splitlines():
for c in chromsize:
ngene = int(chromsize[c] / res) + 1
start_time = time.time()
pad=1
rp=0.5
Q = np.zeros(ngene * ngene)
print("Processing"+" "+cell+" "+c)
Q=impute_cpu([cell, c, ngene,pad,rp])[1]
end_time = time.time()
Q = Q.reshape(ngene, ngene)
output_topdom(cell,c, Q, res)
Note: as of now this script runs through each single cell as well. This isn’t particularly useful, but it might be dependent on future analysis.
#R v4.0
#remotes::install_github("HenrikBengtsson/TopDom", ref="master")
library(TopDom)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/gcc_cisdistal_triplesparse")
network_files<-list.files(path="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/gcc_cisdistal_triplesparse",pattern="*topdommatrix")
for (i in network_files){
skip_to_next<-FALSE
print(paste("Processing file:",i))
tad=tryCatch(TopDom(data = i, window.size = 5),error=function(e){skip_to_next<<-TRUE})
if(skip_to_next){
next
} else{
out_name=strsplit(i,"[.]")[[1]][1]
out_name=paste0(out_name,".w5.domain")
write.table(tad$bed[1:dim(tad$bed)[1],2:4], file=out_name, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)}
}
library(ggplot2)
library(ComplexHeatmap)
library(reshape2)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/gcc_cisdistal_triplesparse")
files_in<-list.files(pattern="_chr16.txt$")
x<-files_in[1]
plot_singlechr<-function(x){
name_out<-paste0(basename(x),".inter.hic.pdf")
dat<-read.table(x,head=F,sep="\t")
dat$V1<-dat$V1+1 #make it start counting at 1 rather than 0
dat$V2<-dat$V2+1
for (i in 1:nrow(dat)){
if (dat[i,]$V2 < dat[i,]$V1) {
temp<-dat[i,]$V1
dat[i,]$V1 <- dat[i,]$V2
dat[i,]$V2 <- temp
}
}
#create all the country combinations
df <- expand.grid(1:max(dat$V1,dat$V2), 1:max(dat$V1,dat$V2))
#change names
colnames(df) <- c('V1', 'V2')
#add a value of 0 for the new combinations (won't affect outcome)
df$V3 <- 0
#row bind with original dataset
df <- rbind(df, dat)
dat_cast<-as.data.frame(xtabs( V3 ~ V1 + V2, aggregate(V3~V1+V2,df,sum)))
dat_cast<-dcast(data=dat_cast,formula=V1~V2,value.var="Freq",fun.aggregate=sum)
dat_cast<-dat_cast[2:ncol(dat_cast)]
#dat_cast<-log10(dat_cast)
#dat_cast[dat_cast <= -Inf] <- 0
plt<-Heatmap(dat_cast,column_order=1:ncol(dat_cast),row_order=1:nrow(dat_cast),col=c("white","red"))
pdf(name_out)
print(plt)
dev.off()
#if(max(dat_cast)>1.5){
system(paste0("slack -F ",name_out," ryan_todo"))#}
}
lapply(files_in,plot_singlechr)
#writing out SourceData
out_source<-lapply(c("CRC-4442CRC-4671_GCC_H_GGTTAGTTAAGATCAGTTCCTGTT_chr16.txt","CRC-4442CRC-4671_GCC_H_ATTGAGGATGACGCGATAATACAG_chr16.txt","CRC-4442CRC-4671_GCC_H_ATTGAGGACCTGCGTACAGTAGGC_chr16.txt"), function(x){
name_out<-paste0(basename(x),".inter.hic.pdf")
dat<-read.table(x,head=F,sep="\t")
dat$V1<-dat$V1+1 #make it start counting at 1 rather than 0
dat$V2<-dat$V2+1
for (i in 1:nrow(dat)){
if (dat[i,]$V2 < dat[i,]$V1) {
temp<-dat[i,]$V1
dat[i,]$V1 <- dat[i,]$V2
dat[i,]$V2 <- temp
}
}
#create all the country combinations
df <- expand.grid(1:max(dat$V1,dat$V2), 1:max(dat$V1,dat$V2))
#change names
colnames(df) <- c('V1', 'V2')
#add a value of 0 for the new combinations (won't affect outcome)
df$V3 <- 0
#row bind with original dataset
df <- rbind(df, dat)
dat_cast<-as.data.frame(xtabs( V3 ~ V1 + V2, aggregate(V3~V1+V2,df,sum)))
dat_out<-dat_cast
dat_out$cellID<-basename(x)
dat_cast<-dcast(data=dat_cast,formula=V1~V2,value.var="Freq",fun.aggregate=sum)
dat_cast<-dat_cast[2:ncol(dat_cast)]
plt<-Heatmap(dat_cast,column_order=1:ncol(dat_cast),row_order=1:nrow(dat_cast),col=c("white","red"))
pdf(name_out)
print(plt)
dev.off()
system(paste0("slack -F ",name_out," ryan_todo"))#}
return(dat_out)
})
out_source<-do.call("rbind",out_source)
out_source$cellid<-unlist(lapply(strsplit(out_source$cellID,"_"),"[",4))
out_source$chr<-unlist(lapply(strsplit(out_source$cellID,"_"),"[",5))
write.table(out_source,file="SourceData_Fig5c.txt",sep="\t",row.names=F,quote=F)
system("slack -F SourceData_Fig5c.txt ryan_todo")
Input:
domlist: a list of all the topdom output. cluster: the cluster assignment of each cell with the same order of domlist. celltypelist: a list of all possible cell type label. res: the resolution of domains. chrom: the chromosome.
Returns:
sc_dom: domain boundaries in each single cell indicated by a binary matrix. dom_prob: domain boundary frequency in each cluster with the same order as in celltypelist. bins: The tested bins, since the bins with 0.0 or 1.0 domain frequency in any of the clusters were not tested. pvalue: The p-value of each bin in bins.
import time
import numpy as np
from schicluster import *
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics.cluster import adjusted_rand_score as ARI
import csv
import glob
import pandas as pd
import re
from statsmodels.sandbox.stats.multicomp import multipletests as FDR
#using this filter function for subsetting lists
def Filter(string, substr):
return [str for str in string
if any(sub in str for sub in substr)]
#diff domain call from scihicluster, pretty extensively modified to work properly
def diff_dom(args):
domlist, cluster, ctlist, res, c, chromsize = args
start_time = time.time()
ctdict = {x:i for i,x in enumerate(ctlist)}
ngene = int(chromsize[c] / res) + 1
bound = np.zeros((len(cluster), ngene))
for i,cell in enumerate(domlist):
domain = np.loadtxt(cell, dtype = np.str)
C = domain[domain[:,-1]=='domain', :2].astype(int) / res
bound[i, np.array(list(set(C.flatten())),dtype=int)] += 1
cellfilter = (np.sum(bound, axis=1)>0)
count = np.array([np.sum(np.logical_and([k==x for x in cluster], cellfilter)) for k in ctlist])
prob = np.array([np.sum(bound[[x==k for x in cluster]], axis=0) for k in ctlist])
ptmp, btmp = [],[]
for i in range(ngene):
contig = [[prob[j,i], count[j]-prob[j,i]] for j in range(len(ctlist))]
if np.sum(np.sum(contig, axis=0)==0)==0:
ptmp.append(chi2_contingency(contig)[1])
btmp.append('\t'.join([c, str(i*res), str(i*res+res)]))
print(c, time.time()-start_time)
return [bound, prob/count[:,None], btmp, ptmp]
network_file_list=glob.glob("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/network*txt")
domlist=glob.glob("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/gcc_cisdistal_triplesparse/*w5.domain")
chromsize_file="/home/groups/oroaklab/adey_lab/projects/sciWGS/Public_Data/chromLengths.txt"
hg38dim=[248956422,242193529,198295559,190214555,181538259,170805979,159345973,145138636,138394717,133797422,135086622,133275309,114364328,107043718,101991189,90338345,83257441,80373285,58617616,64444167,46709983,50818468] #this is all autosomes, but I'm just using chr1-16 for now (others too sparse so it throws errors)
chrom = [str(i+1) for i in range(16)]
chromsize = {chrom[i]:hg38dim[i] for i in range(len(chrom))}
chr_count = []
chr_names=["chr"+s for s in chrom]
res=1000000
for chrom in chromsize:
chrom_filt_name="_chr"+chrom+"." #set up chr name
domlist_chr=[i for i in domlist if chrom_filt_name in i] #first filter topdom list to the chromosome
domlist_cell_order=[] #initiate empty topdom list for each chr
cluster_list=[] #initiate empty cluster list for each chr
for cluster in network_file_list: #loop through each cluster network file
clus_name=[cluster.split("/")[-1].split(".")[0]] #set cluster name
cluster_list=cluster_list+(clus_name)*len(open(cluster).read().splitlines()) #make a list of cluster assignment per cell by multiplying cluster name by number of cells
domlist_cell_order=domlist_cell_order+Filter(domlist_chr,open(cluster).read().splitlines()) #make list of topdoms with appropriate cluster assignment and chr
celltype_list=list(set(cluster_list)) #set celltype_list by unique celltypes
sc_dom, dom_prob, bins, pvalue=diff_dom([domlist_cell_order,cluster_list,celltype_list,res,chrom,chromsize])
bin_names=["chr"+chrom+":"+str(i*res)+"-"+str((i+1)*res) for i in range(np.shape(sc_dom)[1])]
cellid_names=[cell.split("/")[-1].split("_chr")[0] for cell in domlist_cell_order]
sc_dom_df=pd.DataFrame(sc_dom,index=cellid_names,columns=bin_names)
dom_prob_df=pd.DataFrame(dom_prob,index=celltype_list,columns=bin_names)
pval_df=pd.DataFrame(pvalue,index=[col.split("\t")[0]+":"+col.split("\t")[1]+"-"+col.split("\t")[2] for col in bins],columns=["pvalue"])
sc_dom_df.to_csv("chr"+chrom+"_scDomainBoundaries.csv")
dom_prob_df.to_csv("chr"+chrom+"_scDomainProbabilities.csv")
pval_df.to_csv("chr"+chrom+"_scDomainProbabilities_pval.csv")
This is so we maintain transchromosomal interactions.
#merge raw matrices for whole chr comparisons
import glob
import pysam
import pandas as pd
from collections import defaultdict
import time
import sys, os
# Disable
def blockPrint():
sys.stdout = open(os.devnull, 'w')
# Restore
def enablePrint():
sys.stdout = sys.__stdout__
def read_pair_generator(bam):
"""
Generate read pairs in a BAM file or within a region string.
Reads are added to read_dict until a pair is found.
"""
read_dict = defaultdict(lambda: [None, None])
for read in bam.fetch(until_eof=True):
qname = read.query_name
if qname not in read_dict:
if read.is_read1:
read_dict[qname][0] = read
else:
read_dict[qname][1] = read
else:
if read.is_read1:
yield read, read_dict[qname][1]
else:
yield read_dict[qname][0], read
del read_dict[qname]
with open("/home/groups/oroaklab/adey_lab/projects/sciWGS/200522_s3WGS_CRC/gcc_cisdistal_triplesparse/scHiCluster.annot","r") as annot:
annot_df=pd.DataFrame([line.split() for line in annot],columns=["cellID","cluster"])
annot_df["cluster"]=[x[1] for x in annot_df["cellID"].str.split(pat="_")] #using cell lines
annot_df["cellname"]=[line.split("_")[2] for line in annot_df["cellID"]]
bam_dir="/home/groups/oroaklab/adey_lab/projects/sciWGS/200522_s3WGS_CRC/" #working directory
bam_bulk=["hg38.s3GCC_4671.bbrd.q10.filt.bam","hg38.s3GCC_4442.bbrd.q10.filt.bam"] #list of bam files
#Takes read 1 and read 2 paired by query name with the above function
dir_out="/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/for_gurkan/"
i=0 # start counter
start_time = time.time() # start timer
chr_set=["chr"+str(i) for i in range(1,23)]
blockPrint()
for clus in annot_df["cluster"].unique():
cell_set=list(annot_df.loc[annot_df["cluster"]==str(clus)]["cellID"])
with open(dir_out+"s3GCC_clus"+clus+".bulk","w") as fout:
for bulk in bam_bulk:
cellline=bulk.split(".")[1]
bam = pysam.AlignmentFile(bam_dir+bulk, 'rb')
for read1, read2 in read_pair_generator(bam): #set paired reads together
if read1 is not None and read2 is not None: #remove reads which dont pair
chr1=bam.get_reference_name(read1.reference_id) #set chr1 name
chr2=bam.get_reference_name(read2.reference_id) #set chr2 name
if (chr1 in chr_set) and (chr2 in chr_set):
if ((chr1 == chr2) and (abs(read1.template_length)>=1000)) or (chr1 is not chr2): #so same chr >1kb or diff chr
i+=1
if i % 10000 == 0:
enablePrint()
print("Processed "+str(i)+" distal reads in "+ str(time.time()-start_time) +" seconds.")
blockPrint()
str1="+"
str2="+"
if read1.is_reverse:
str1="-"
if read2.is_reverse:
str2="-"
pos1=read1.reference_start
pos2=read2.reference_start
cellid_out=cellline+"_"+read1.query_name.split(":")[0] #assign read to cellID
if cellid_out in cell_set: #check to make sure it was a cellID used in clustering
out_list=[read1.query_name,chr1,pos1,str1,chr2,pos2,str2]
fout.write("\t".join(str(item) for item in out_list)+"\n")
#1. Read Name (can be blank)
#2. chromosome for read 1
#3. positions for read 1 (5' end of read, one-indexed)
#4. strand of read 1 (+ or -)
#5. chromosome for read 2
#6. positions for read 2 (5' end of read, one-indexed)
#7. strand of read 2 (+ or -)
import subprocess as sp
### Using awk for final formatting of text into HOMER input.
cmd= """for i in `ls /home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/for_gurkan/*bulk`; do awk 'OFS="\t" {if ($2 > $5){ print $1,$5,$6,$7,$2,$3,$4} else {print}}' $i | awk 'OFS="\t" {if (($3 > $6) && ($2 == $5)) {print $1,$5,$6,$7,$2,$3,$4} else {print}}' | sort -T . -S 2G -k2,2d -k5,5d -k3,3n -k6,6n > $i.sorted.hicSummary ; done"""
p = sp.Popen(cmd, stdin=sp.PIPE, stdout = sp.PIPE, stderr = sp.PIPE,shell=True)
p.wait()
### Using Homer for generation of the raw formatted HiC matrix
#Resolution of 2.5MB and uwing 20CPUs
cmd="""for i in clus4442 clus4671;
do /home/groups/oroaklab/src/homer/bin/makeTagDirectory s3GCC_${i} -format HiCsummary s3GCC_${i}.bulk.sorted.hicSummary;
/home/groups/oroaklab/src/homer/bin/analyzeHiC s3GCC_${i} -res 2500000 -raw -cpu 20 > $i.hic.matrix; done"""
p = sp.Popen(cmd, stdin=sp.PIPE, stdout = sp.PIPE, stderr = sp.PIPE,shell=True)
p.wait()
I am both looping through individual clusters and performing a cluster by cluster difference in contacts.
library(ComplexHeatmap)
library(HiCcompare)
library(circlize)
library(EnsDb.Hsapiens.v86)
setwd("/home/groups/oroaklab/adey_lab/projects/sciWGS/200730_s3FinalAnalysis/s3gcc_data/gcc_cisdistal_triplesparse")
read_dat<-function(x){
dat<-read.table(x)
bincount<-seqlengths(EnsDb.Hsapiens.v86)["3"] %/% 1000000
bins<-paste0("bin_",0:bincount)
dat$V1<-paste0("bin_",dat$V1)
dat$V2<-paste0("bin_",dat$V2)
dat_out<-data.frame(matrix(0,ncol=length(bins),nrow=length(bins)))
colnames(dat_out)<-bins
row.names(dat_out)<-bins
for (row in 1:nrow(dat)) {
dat_out[dat[row,1],dat[row,2]] <- dat[row,3]}
dat_out[which(is.na(dat_out),arr.ind=T)]<-0
name_out<-substr(x,1,nchar(x)-4)
plt_dat(dat_out,name_out)
return(dat_out)
}
plt_dat<-function(x,file_name){
dat<-x
col_fun<-viridis(10)
col_fun<-colorRamp2(c(0,1,3,20),c("#450256","#5AC865","#21908D","#F9E721"))
pdf(paste0(file_name,".inter.hic.png"))
plt<-Heatmap(as.matrix(dat),
row_order=1:nrow(dat),
column_order=1:ncol(dat),
show_row_names=FALSE,
show_column_names=FALSE,
col = col_fun)
print(plt)
dev.off()
system(paste0("slack -F ",file_name,".inter.hic.png", " ryan_todo"))
}
for (i in c(as.character(1:22),"X")){
i="3"
files_in<-list.files(pattern=paste0("*chr",i,".txt"))
for (x in files_in){
dat<-read_dat(x)
}}
#plot each cluster
lapply(c("clus4442.hic.matrix","clus4671.hic.matrix"),FUN=plt_dat)
clus4442<-norm_dat("clus4442.hic.matrix")
clus4671<-norm_dat("clus4671.hic.matrix")
plt_chr<-function(x=clus4442,y=clus4671,chr_in){
x<-x[grepl(paste0(chr_in,"-"),colnames(x)),grepl(paste0(chr_in,"-"),rownames(x))]
col_fun_4442 = colorRamp2(c(0,as.numeric(quantile(unlist(x), probs = c(0.90))),as.numeric(quantile(unlist(x), probs = c(0.99))))
, c("#fff5eb", "#f16913","#7f2704"))
plt_4442<-Heatmap(as.matrix(x),
cluster_rows=F,
cluster_columns=F,
show_row_names=FALSE,
show_column_names=FALSE,
col=col_fun_4442,
column_title=paste("CRC 4442",chr_in),
row_order=1:nrow(x),
column_order=1:ncol(x))
y<-y[grepl(paste0(chr_in,"-"),colnames(y)),grepl(paste0(chr_in,"-"),rownames(y))]
col_fun_4671 = colorRamp2(c(0,as.numeric(quantile(unlist(y), probs = c(0.90))),as.numeric(quantile(unlist(y), probs = c(0.99))))
, c("#fff5eb", "#f16913","#7f2704"))
plt_4671<-Heatmap(as.matrix(y),
cluster_rows=F,
cluster_columns=F,
show_row_names=FALSE,
show_column_names=FALSE,
col=col_fun_4671,
column_title=paste("CRC 4671",chr_in),
row_order=1:nrow(y),
column_order=1:ncol(y))
pdf(paste0("cellline_",chr_in,".intra.hic.png"),width=10)
print(plt_4442+plt_4671)
dev.off()
system(paste0("slack -F ",paste0("cellline_",chr_in,".intra.hic.png"), " ryan_todo"))
}
for (i in paste0("chr",1:20)){plt_chr(chr_in=i,x=clus4442,y=clus4671)}
match_data_chr<-function(i,dat_mat){
out<-unlist(lapply(strsplit(colnames(dat_mat),"-"),"[",1)) %in% strsplit(row.names(dat_mat)[i],"-")[[1]][1]
return(out)}
sel_chr<-function(i,j,dat_mat){
out_row<-unlist(lapply(strsplit(colnames(dat_mat),"-"),"[",1)) %in% i
out_col<-unlist(lapply(strsplit(colnames(dat_mat),"-"),"[",1)) %in% j
out_matched<-out_row & out_col
return(out_matched)
}
diff_mat<-function(x,y){
if (x !=y){
print(x)
print(y)
dat_ref<-read.table(x,skip=1)
dat_ref<-dat_ref[2:ncol(dat_ref)]
row.names(dat_ref)<-dat_ref[,1]
dat_ref<-dat_ref[2:ncol(dat_ref)]
colnames(dat_ref)<-row.names(dat_ref)
dat_ref<-as.matrix(dat_ref)
#dat_ref[do.call("rbind",lapply(1:nrow(dat_ref),FUN=match_data_chr,dat_mat=dat_ref))]<-0
#dat_ref<-dat_ref[sel_chr("chr15","chr15",dat_ref),sel_chr("chr15","chr15",dat_ref)] #This can be used for selecting individual chromosomes for comparison
dat_ref<-KRnorm(dat_ref)
dat_test<-read.table(y,skip=1)
dat_test<-dat_test[2:ncol(dat_test)]
row.names(dat_test)<-dat_test[,1]
dat_test<-dat_test[2:ncol(dat_test)]
colnames(dat_test)<-row.names(dat_test)
dat_test<-as.matrix(dat_test)
#dat_test[do.call("rbind",lapply(1:nrow(dat_test),FUN=match_data_chr,dat_mat=dat_test))]<-0
#dat_test<-dat_test[sel_chr("chr15","chr15",dat_test),sel_chr("chr15","chr15",dat_test)]
dat_test<-KRnorm(dat_test)
dat_ref<-dat_ref[(colnames(dat_ref) %in% colnames(dat_ref)) & (colnames(dat_ref) %in% colnames(dat_test)),(row.names(dat_ref) %in% row.names(dat_ref)) & (row.names(dat_ref) %in% row.names(dat_test))]
dat_test<-dat_test[(colnames(dat_test) %in% colnames(dat_ref)) & (colnames(dat_test) %in% colnames(dat_test)),(row.names(dat_test) %in% row.names(dat_ref)) & (row.names(dat_test) %in% row.names(dat_test))]
dat_diff<-dat_test-dat_ref
x<-strsplit(x,"[.]")[[1]][1]
y<-strsplit(y,"[.]")[[1]][1]
chr_split<-unlist(lapply(strsplit(row.names(dat_diff),"-"),"[",1))
chr_split<-as.numeric(unlist(lapply(strsplit(chr_split,"r"),"[",2)))
col_func = colorRamp2(as.numeric(quantile(unlist(dat_diff), probs = c(0.001,0.1,0.5,0.9,0.999))), c("#d7191c", "#fdae61", "white", "#abdda4", "#2b83ba"))
dat_diff[which(dat_diff %in% c(NaN,-Inf,Inf),arr.ind=T)]<-NaN
pdf(paste0(x,"by",y,".difftest.heatmap.pdf"))
plt<-Heatmap(as.matrix(dat_diff),
na_col="white",
cluster_rows=F,
cluster_columns=F,
show_row_names=FALSE,
show_column_names=FALSE,
row_split=chr_split,
column_split=chr_split,
col=col_func)
print(plt)
dev.off()}
}
for (y in c("clus0.hic.matrix","clus1.hic.matrix","clus2.hic.matrix")){
lapply(c("clus0.hic.matrix","clus1.hic.matrix","clus2.hic.matrix"),FUN=diff_mat,y=y)
}
#I've been having a strange bug with KRnorm, where within this function it stalls and doesn't complete, but if clusters are run individually it is fine.
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS: /home/groups/oroaklab/src/R/R-4.0.0/lib/libRblas.so
LAPACK: /home/groups/oroaklab/src/R/R-4.0.0/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 parallel grid stats graphics grDevices utils
[8] datasets methods base
other attached packages:
[1] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.13.1
[3] AnnotationFilter_1.13.0 GenomicFeatures_1.41.3
[5] AnnotationDbi_1.51.3 Biobase_2.49.1
[7] GenomicRanges_1.41.6 GenomeInfoDb_1.25.11
[9] IRanges_2.23.10 S4Vectors_0.27.12
[11] BiocGenerics_0.35.4 circlize_0.4.10
[13] HiCcompare_1.12.0 reshape2_1.4.4
[15] ComplexHeatmap_2.5.5 TopDom_0.9.1
[17] Rphenograph_0.99.1 igraph_1.2.6
[19] gridExtra_2.3 dplyr_1.0.2
[21] ggplot2_3.3.2
loaded via a namespace (and not attached):
[1] colorspace_1.4-1 rjson_0.2.20
[3] ellipsis_0.3.1 CGHcall_2.52.0
[5] DNAcopy_1.63.0 XVector_0.29.3
[7] GlobalOptions_0.1.2 clue_0.3-57
[9] listenv_0.8.0 bit64_4.0.5
[11] codetools_0.2-16 splines_4.0.0
[13] R.methodsS3_1.8.1 impute_1.64.0
[15] Rsamtools_2.5.3 cluster_2.1.0
[17] dbplyr_1.4.4 png_0.1-7
[19] R.oo_1.24.0 pheatmap_1.0.12
[21] compiler_4.0.0 httr_1.4.2
[23] assertthat_0.2.1 Matrix_1.2-18
[25] lazyeval_0.2.2 limma_3.45.13
[27] prettyunits_1.1.1 tools_4.0.0
[29] gtable_0.3.0 glue_1.4.2
[31] GenomeInfoDbData_1.2.3 RANN_2.6.1
[33] rappdirs_0.3.1 Rcpp_1.0.5
[35] vctrs_0.3.4 Biostrings_2.57.2
[37] rhdf5filters_1.1.2 nlme_3.1-149
[39] rtracklayer_1.49.5 QDNAseq_1.26.0
[41] stringr_1.4.0 globals_0.13.0
[43] lifecycle_0.2.0 gtools_3.8.2
[45] XML_3.99-0.5 InteractionSet_1.18.0
[47] future_1.19.1 zlibbioc_1.35.0
[49] scales_1.1.1 ProtGenerics_1.21.0
[51] hms_0.5.3 SummarizedExperiment_1.19.6
[53] rhdf5_2.33.7 RColorBrewer_1.1-2
[55] curl_4.3 memoise_1.1.0
[57] biomaRt_2.45.2 CGHbase_1.50.0
[59] stringi_1.5.3 RSQLite_2.2.1
[61] BiocParallel_1.23.2 shape_1.4.5
[63] rlang_0.4.8 pkgconfig_2.0.3
[65] matrixStats_0.57.0 bitops_1.0-6
[67] lattice_0.20-41 purrr_0.3.4
[69] Rhdf5lib_1.11.3 GenomicAlignments_1.25.3
[71] bit_4.0.4 tidyselect_1.1.0
[73] plyr_1.8.6 magrittr_1.5
[75] R6_2.5.0 generics_0.0.2
[77] DelayedArray_0.15.7 DBI_1.1.0
[79] pillar_1.4.6 withr_2.3.0
[81] mgcv_1.8-33 RCurl_1.98-1.2
[83] tibble_3.0.4 future.apply_1.6.0
[85] crayon_1.3.4 KernSmooth_2.23-17
[87] BiocFileCache_1.13.1 GetoptLong_1.0.3
[89] progress_1.2.2 data.table_1.13.0
[91] marray_1.68.0 blob_1.2.1
[93] digest_0.6.27 R.utils_2.10.1
[95] openssl_1.4.3 munsell_0.5.0
[97] askpass_1.1