# pw-align MHC sequences (haplotypes) minimap2 -c -x asm20 -X -t 4 MHC/MHC.fa.gz MHC/MHC.fa.gz | gzip > MHC.paf.gz # Creates a graph from pw-aligned sequences seqwish -s MHC/MHC.fa.gz -p MHC.paf.gz -t 4 -b MHC -g MHC.gfa # Convert gfa to vg format. We chopped it because vg gcsa does not support nodes > 1024 vg view -F MHC.gfa -v | vg mod -X 256 -M 8 - | vg sort - > MHC.vg # Build indices vg index -x MHC.xg MHC.vg vg prune -k 16 -e 3 -s 0 MHC.vg > MHC.pruned.vg vg index -g MHC.gcsa MHC.pruned.vg # Concatenate all FASTA files for new haplotypes and flatten the sequences because vg mgsa complains. It does complain even if we do this :( Probably because seqs are highly conserved cat MHC/IMGT_alleles/*_gen.fasta > all_gen.fasta awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < all_gen.fasta > all_gen_flat.fasta # Upgrade the MHC graph with new haplotype sequences. This takes tooooo long because it makes an index every time it aligns a sequence. vg msga -f all_gen.fasta -g MHC.vg -t 4 > all.vg # Calculate depth of short reads (illumina) samtools depth MHC/TrioData/illumina/HG00512.bam | awk 'BEGIN {FS="\t"} {qual += $3} END {print qual / NR}' samtools depth MHC/TrioData/illumina/HG00513.bam | awk 'BEGIN {FS="\t"} {qual += $3} END {print qual / NR}' samtools depth MHC/TrioData/illumina/HG00514.bam | awk 'BEGIN {FS="\t"} {qual += $3} END {print qual / NR}' # Calculate the genome size samtools faidx MHC/MHC.fa cat MHC/MHC.fa.fai # Edit config.yaml to run error correction of long reads, for every one of the three parent-child samples. Using snakemake # We killed vg mgsa because it creates an index every time it aligns a new seq. # Instead, we run minimap2 so that it only keeps the first best alignment minimap2 -cx asm20 -t 8 -a --secondary=no MHC/MHC.fa MHC/IMGT_alleles/hla_gen.fasta > hla_gen.MHC.sam # Then vg inject to make a gam alignment vg inject -x MHC.xg -t 8 hla_gen.MHC.sam > hla_gen.MHC.gam # We need to augment the graph with the new haplotypes vg augment -i MHC.vg hla_gen.MHC.gam -A MHC.aug.gam > MHC.aug.vg # Build Index vg index -x MHC.aug.xg MHC.aug.vg # Map (error-corrected) reads to graph from Chinese parent-child GraphAligner -g MHC.aug.vg -f corrected_reads/output2/corrected.fa -a HG00512.gam GraphAligner -g MHC.aug.vg -f corrected_reads/output3/corrected.fa -a HG00513.gam GraphAligner -g MHC.aug.vg -f corrected_reads/output4/corrected.fa -a HG00514.gam # Map Long Reads (HG002) to graph GraphAligner -t 2 -g MHC.aug.vg -f MHC/HG002/HG002.PacBio.15kbCCS.Q20.hs37d5.pbmm2.MAPQ60.HP10xtrioRTG.MHConly.newsplit.28498559.H1.fastq.gz -a HG002.pb.H1.gam & GraphAligner -t 2 -g MHC.aug.vg -f MHC/HG002/HG002.PacBio.15kbCCS.Q20.hs37d5.pbmm2.MAPQ60.HP10xtrioRTG.MHConly.newsplit.28498559.H2.fastq.gz -a HG002.pb.H2.gam & GraphAligner -t 2 -g MHC.aug.vg -f MHC/HG002/HG002.PacBio.15kbCCS.Q20.hs37d5.pbmm2.MAPQ60.HP10xtrioRTG.MHConly.newsplit.28498559.untagged.fastq.gz -a HG002.pb.untagged.gam & GraphAligner -t 2 -g MHC.aug.vg -f MHC/HG002/HG002_Promethion.MHConly.28498559.H1.fastq.gz -a HG002.prom.H1.gam & GraphAligner -t 2 -g MHC.aug.vg -f MHC/HG002/HG002_Promethion.MHConly.28498559.H2.fastq.gz -a HG002.prom.H2.gam & GraphAligner -t 2 -g MHC.aug.vg -f MHC/HG002/HG002_Promethion.MHConly.28498559.untagged.fastq.gz -a HG002.prom.untagged.gam & # Pack alignments to get coverage (per base) vg pack -x MHC.aug.xg -g HG00512.gam -d > HG00512.cov.txt vg pack -x MHC.aug.xg -g HG00513.gam -d > HG00513.cov.txt vg pack -x MHC.aug.xg -g HG00514.gam -d > HG00514.cov.txt vg pack -x MHC.aug.xg -g HG002.pb.H1.gam -d > HG002.pb.H1.cov.txt vg pack -x MHC.aug.xg -g HG002.pb.H2.gam -d > HG002.pb.H2.cov.txt vg pack -x MHC.aug.xg -g HG002.pb.untagged.gam -d > HG002.pb.untagged.cov.txt vg pack -x MHC.aug.xg -g HG002.prom.H1.gam -d > HG002.prom.H1.cov.txt vg pack -x MHC.aug.xg -g HG002.prom.H2.gam -d > HG002.prom.H2.cov.txt vg pack -x MHC.aug.xg -g HG002.prom.untagged.gam -d > HG002.prom.untagged.cov.txt # Extract nodelist for every path (corresponds to an haplotype) from graph vg paths -x MHC.aug.xg -X -Q "HLA:HLA" | vg view -a - | jq '{name: .name, nodes: ([.path.mapping[].position.node_id | tostring] | join(","))}' | grep "name" -A 1 | grep "^\-\-" -v > nodes_all.txt # Next we map the