Dpuiu CA
Jump to navigation
Jump to search
Sourceforge
- Constants:
cat src/AS_global.h ... #define AS_READ_MIN_LEN 64 #define AS_OVERLAP_MIN_LEN 40 ...
- Download & compile CVS tip
cvs -z3 -d:pserver:anonymous@wgs-assembler.cvs.sourceforge.net:/cvsroot/wgs-assembler co -P wgs-assembler cd wgs-assembler/src make SITE_NAME=LOCAL => executables under wgs-assembler/Linux-amd64/bin/
- Download & compile release
wget ... bzip2 -dc wgs-6.0-beta.tar.bz2 | tar -xf - cd wgs-6.0-beta cd kmer sh configure.sh gmake install cd ../src gmake cd ..
- Compile for debugging :
make SITE_NAME=LOCAL BUILDDEBUG=1 # in "c_make.as"
Run
- Help
runCA.pl -fields
- Batch submission
touch runCA.sh echo --- runCA $$ restarted by $USER at `date` --- >>& runCA.log ~/bin/runCA -d . -p prefix -s ~/bin/runCA.spec *.frg >>& runCA.log
at runCA.sh
- Sanger
runCA -d . -p prefix -s spec Sanger.frg
- doOverlapTrimming=1 (default)
- low qual reads (ex: 20 or 'D') are discarded
- astatLowBound 1 (default) ; can be lowered for low/non-uniform coverage
- astatHighBound 5 (default)
- computeInsertSize=0 (default)
- 454 & Hybrid : convert sff to frg
sffToCA -libraryname 454 -output 454 454.sff runCA -d . ovlOverlapper=mer unitigger=bog -p prefix *.frg runCA -d . -s ~/bin/runCA.454.spec -p prefix *.frg
- run concurrently
cnsConcurrency=4 frgCorrConcurrency=4 merOverlapperExtendConcurrency=4 merOverlapperSeedConcurrency=4 ovlConcurrency=4 ovlCorrConcurrency=4 ...
- gatekeeper -L : search for 454 paired end linker
- AS_GKP_sff.c : Load_SFF function
- for mated libs set mean=3000,stdev=300
Parameters
- stopAfter:
initialStoreBuilding overlapBasedTrimming | OBT overlapper unitigger consensusAfterUnitigger scaffolder consensusAfterScaffolder Example: ~/bin/runCA -d . -p asm -s ~/bin/runCA.454.spec stopAfter=initialStoreBuilding Saureus.frg
- cgwOutputIntermediate = 1 => "cgw -G"
=> .cgw & .cgw_contigs (generated only at the last doExtendClearRanges step)
- doExtendClearRanges = 2 => cgw run 3 times, ECR 2 times
Restarting a job
# kill runCA ps -C perl # displays the runCA PID ps -C perl | tac | perl -ane 'system "kill -9 $F[0]\n";'
# in OBT/overlap ps -C perl | tac | perl -ane 'system "kill -9 $F[0]\n";' ps -C overlap | tac | perl -ane 'system "kill -9 $F[0]\n";' # OBT mkdir 0-overlaptrim-overlap/OLD mv 0-overlaptrim-overlap/*sh 0-overlaptrim-overlap/OLD
# overlap mkdir 1-overlapper/OLD mv 1-overlapper/*sh 1-overlapper/OLD
Gatekeeper
Build
- Create & add
gatekeeper -o prefix.gkpStore -T -F prefix*frg gatekeeper -a -o prefix.gkpStore -T -F prefix*frg
Update
- edit frg: act:R !!!
{DST act:R acc:10691 mea:161662 std:28760 }
- append
gatekeeper -a -o prefix.gkpStore -T -F edit.frg
- Using tab file
Example: No restrictions on lib stddev with --edit option
cat prefix.update lib uid 19070 mean 167001 lib uid 19070 stddev 167001 lib uid 19070 isNotRandom 1 lib uid 19070 Orientation O # outie orientation for Illumina libs >1K gatekeeper --edit prefix.update prefix.gkpStore
- Update read CLR/CLV
gatekeeper -a -r prefix.clr -o prefix.gkpStore # not in the wgs release gatekeeper -a -v prefix.clv -o prefix.gkpStore
Delete
- DST messages cannot be deleted
- A FRG can be deleted only if there is no related LKG message; an abbreviated MSG
- act:D !!!
# frg1 {LKG act:D typ:M fg1:94376 fg2:94654 } {FRG act:D acc:94376 } # frg2 ??? {LKG act:D fg:94376 fg:94654 } {FRG act:D acc:94376 }
cat delete.ids | perl -ane 'print "{FRG\nact:D\nacc:$F[0]\n}\n";'> delete.frg gatekeeper -a -o prefix.gkpStore -T -F delete.frg
Dump
- INFO
gatekeeper -dumpinfo prefix.gkpStore
- LAST frg in store
gatekeeper -dumpinfo -lastfragiid prefix.gkpStore
- FASTA sequences
gatekeeper -dumpfastaseq prefix.gkpStore > prefix.seq gatekeeper -dumpfastaqlt prefix.gkpStore > prefix.qual
- FASTA reads for newbler
gatekeeper -dumpnewbler newbler prefix.gkpStore => prefix.fna,prefix.fna.qual grep "^>" prefix.fna | head -2 >SRR001355.3635.1a template=2020+2021 dir=F library=SRX000348 trim=20-117 >SRR001355.3635.1b template=2020+2021 dir=R library=SRX000348 trim=1-130
- FRG file
gatekeeper -dumpfrg prefix.gkpStore > prefix.frg gatekeeper -dumpfrg prefix.gkpStore -format2 > prefix.frg2
- READS in TAB format
gatekeeper -dumpfragments -tabular prefix.gkpStore > prefix.gkpStore.info gatekeeper -dumpfragments -tabular prefix.gkpStore | grep -v ^UID | awk '{print $1,$2}' > prefix.gkpStore.uid2iid
- LIBS in TAB format
gatekeeper -dumplibraries -tabular prefix.gkpStore | grep -v ^UID | awk '{print $1,$4,$5}' | sed 's/\.000//g'> prefix.dst # original vals asm2mdi.pl < prefix.asm > prefix.mdi # final vals
- READ CLR's (default=ECR2)
gatekeeper -dumpfragments -tabular prefix.gkpStore/ | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > prefix.clr gatekeeper -dumpfragments -tabular -clear [CLR|OBTINITIAL|OBTMERGE|OBTCHIMERA|ECR_0|ECR_1|ECR_2|LATEST] prefix.gkpStore/ | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > prefix.?.clr
gatekeeper -dumpfragments -tabular -clear CLR asm.gkpStore | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > asm.CLR.clr gatekeeper -dumpfragments -tabular -clear OBTINITIAL asm.gkpStore | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > asm.OBTINITIAL.clr & gatekeeper -dumpfragments -tabular -clear OBTCHIMERA asm.gkpStore | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > asm.OBTCHIMERA.clr & ... gatekeeper -dumpfragments -tabular -clear LATEST asm.gkpStore | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > asm.LATEST.clr &
- READ CLR's summary
gatekeeper -dumpfragments asm.gkpStore | grep ^fragmentClear | perl -ane ' print $1," ",$3-$2,"\n" if(/(\w+),(\d+),(\d+)$/);' | ~/bin/sum2.pl
- MATED
frg22mates.pl < prefix.frg > prefix.mates
- Subset
touch file.uids gatekeeper -uid file.uids ...
Meryl
- Get mer frequencies
meryl -C -B -m 22 -s prefix.seq -o prefix.22mers
- Dump mer sequences
meryl -Dt -s prefix.22mers
- Dump histogram
meryl -Dh -s prefix.22mers | sort -nk2 -r | more
OBT
- runCA spec
doOverlapTrimming = 1
- command
overlap -G ... # -G to compute partial overlaps
- Dump
overlapStore -d ./0-overlaptrim/asm.obtStore
- Number of jobs: should be < fileListMax (1024*10 by default) otherwise overlapStore fails
~/bin/getOvlJobsCount.pl ovlHashBlockSize ovlRefBlockSize frgCount
Overlap
- Jobs
merOverlapperSeedBatchSize 100000 merOverlapperExtendBatchSize 75000 overmerry.sh : #frgs/merOverlapperSeedBatchSize jobs -> seeds/ dir olap-from-seeds.sh : #frgs/merOverlapperExtendBatchSize jobs -> olaps/ dir
- Dump
overlapStore -d prefix.ovlStore > prefix.ovl.tab overlapStore -d prefix.ovlStore | awk '{print $1}' | uniq -c | awk '{print $2,$1}' > prefix.ovlStore.count.tmp cat prefix.ovlStore.count.tmp | perl -ane 'foreach ($p+1..$F[0]-1) { print "$_ 0\n" } $p=$F[0]; print $_' > prefix.ovlStore.count rm prefix.ovlStore.count.tmp
- Get stats; median number of ovls (5'/3') for each lib
overlapStats -G prefix.gkpStore -O prefix.ovlStore/ -o prefix cat prefix.repeatmodel.lib.00*stats | egrep '^Lib|nSamples|median' | p 'chomp; if(/Lib/) {print "\n$_"} else { print " $F[1]"}' | pretty -o
Unitigger
- list unitigs
tigStore -g asm.gkpStore -t asm.tigStore 2 -D unitiglist # ids tigStore -g asm.gkpStore -t asm.tigStore 2 -D properties # properties
- list frags/consensus/layout in a unitig
tigStore -g asm.gkpStore -t asm.tigStore 2 -u 0 -d frags tigStore -g asm.gkpStore -t asm.tigStore 2 -u 0 -d consensus tigStore -g asm.gkpStore -t asm.tigStore 2 -u 0 -d layout
- delete a unitig : get & update layout
tigStore -g asm.gkpStore -t asm.tigStore 2 -u id -d layout > utg.id.layout cat utg.id.layout | grep -v FRG | p 'if(/^data.num_frags/) { print "data.num_frags\t0\n"} else { print $_ }' > utg.id.layout.delete tigStore -g asm.gkpStore -t asm.tigStore 2 -u id -R utg.id.layout.delete
- list deleted unitigs
tigStore -g asm.gkpStore -t asm.tigStore 2 -D unitiglist | perl -ane 'print $_ if($.==1 or $F[0]==122533);' | pretty maID isPresent isDeleted ptID svID fileOffset covStat id 1 1 11 1 3174408 113 ...
CGW
Consensus
- Dump unitig seq
tigStore -g asm.gkpStore -t asm.tigStore/ 20 -D unitigs > unitigs.txt cat unitigs.txt | grep -v ^maID | perl -ane 'system "tigStore -g asm.gkpStore -t asm.tigStore 20 -d consensus -u $F[0]\n";' >& unitigs.fasta.tmp cat unitigs.fasta.tmp | grep -v null | sed 's/cns=//' | sed 's/-//g' | nl0 | tab2fasta.pl > unitigs.fasta rm unitigs.fasta.tmp
- Dump contig seq
tigStore -g asm.gkpStore -t asm.tigStore/ 20 -D contigs > contigs.txt cat contigs.txt | grep -v ^maID | perl -ane 'system "tigStore -g asm.gkpStore -t asm.tigStore 20 -d consensus -c $F[0]\n";'>& contigs.fasta.tmp cat contigs.fasta.tmp | grep -v null | sed 's/cns=//' | sed 's/-//g' | nl0 | tab2fasta.pl > contigs.fasta rm contigs.fasta.tmp
- Recompute unitig consensus
utgcns -g ./asm.gkpStore -t ./asm.tigStore 2 011 -u 122533 -v -V
Terminator
Log file & runtime (wgs-5.2)
- Use ~dpuiu/bin/getCAruntimes.pl
cat runCA.log | ~/bin/getCAruntimes.pl -hour -total display runCA.runTimes.png
- runCA.log runCA.times bos taurus BCM.CLONEEND 16,875 Sanger reads
- runCA.log runCA.times wolbachia symbiont of culex pipiens quinquefasciatus 36,767 Sanger reads
Output processing
Posmap
- Get posMap files
buildPosMap -o prefix < prefix.asm
- Get number of reads/ctg or scf (from the pos files)
cat prefix.posmap.frgctg | count.pl -i 1 > prefix.posmap.ctgfrgcnt cat prefix.posmap.frgscf | count.pl -i 1 > prefix.posmap.scffrgcnt cat prefix.posmap.ctgscf | count.pl -i 1 > prefix.posmap.scfctgcnt
- Compute read/mate/ctg cvg (from the pos files)
posmap2cvg.pl < prefix.posmap.frgscf > prefix.posmap.scffrgcvg OLD/posmap2cvg.4.pl -mates prefix.posmap.mates < prefix.posmap.frgscf > prefix.posmap.scfmatecvg posmap2cvg.pl < prefix.posmap.ctgscf > prefix.posmap.scfctgcvg
- Get 0cvg regions
posmap2cvg.pl -f prefix.posmap.scflen -Max 0 < prefix.posmap.frgscf # scaff gaps posmap2cvg.pl -f prefix.posmap.ctglen -Max 0 < prefix.posmap.frgctg # surrogates
- Get links
~/bin/scf2scflnk.sh prefix ~/bin/ctg2ctglnk.sh prefix ...
- Add ctg/scf info to the mates files
join12.pl prefix.posmap.mates prefix.posmap.frgctg > prefix.posmap.mates2.tmp join12.pl prefix.posmap.mates2.tmp prefix.posmap.frgscf > prefix.posmap.mates2 rm prefix.posmap.mates2.tmp
- Display graphs
cat prefix.posmap.scf2scflnk | ~/bin/tab2dot.pl | dot -Tpng -o prefix.posmap.scf2scflnk.png cat prefix.posmap.ctg2ctglnk | ~/bin/tab2dot.pl | dot -Tpng -o prefix.posmap.ctg2ctglnk.png
- Qc stats based on posmap files
posmap2qc.pl -ctgscf prefix.posmap.ctgscf > prefix.qc
- Break scf/ctg pipeline
~/bin/breakPosmapKeep.amos
- Get scaff file
~/bin/posmap2scaff.pl prefix.posmap.ctgscf
Fasta
- Align deg to scaff
nucmer -mum -l 40 -c 250 scf.fasta deg.fasta -p scf-deg delta-filter -q scf-deg.delta > scf-deg.filter-q.delta ~/bin/delta2pairCvg+.pl < scf-deg.filter-q.delta > scf-deg.filter-q.cvg.pairs ~/bin/max2.pl -i 4 -j 6 < scf-deg.filter-q.cvg.pairs > scf-deg.filter-q.maxCvg.pairs cat scf-deg.filter-q.maxCvg.pairs | p 'print $_ if($F[5]-$F[6]<2000);' > scf-deg.filter-q.variants.pairs awk '{print $5,$6}' scf-deg.filter-q.variants.pairs > scf-deg.filter-q.variants.ids
- Indexing
# contigs ~/bin/formatdb.pl < prefix.ctg.fasta > prefix.ctg.fasta.offset ~/bin/fastacmd.pl -o prefix.ctg.fasta.offset -f filter.ctg.ids < prefix.ctg.fasta > filter.ctg.fasta
# same for scaff, deg ...
- Contaminant search : nucmer against
/nfshomes/dpuiu/db/Ecoli.all /nfshomes/dpuiu/db/UniVec_Core /nfshomes/dpuiu/db/OtherVec
Asm
- Get output seq
asmOutputFasta -p prefix < prefix.asm
- Get output seq clr
~/bin/asm2clrs.pl < prefix.asm > prefix.asm.clr
- Get new library insert sizes:
asm2mdi.pl < prefix.asm > prefix.mdi
- Get surrogate ids
#most of them have negative aStat's $ cat *.asm | grep -B 8 ^sta:S | grep ^acc | perl -ane '/(\d+)/; print "utg".$1,"\n";'
Qc
caqc.pl -euid prefix.asm
Scaffolds
- Get scaff files
cavalidate -e 10 prefix bank2scaff prefix.bnk/ > prefix.scaff
~/bin/ca2scaff -i prefix.asm -o prefix.scaff
cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$2}' > prefix.scfctg cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$3}' > prefix.scflen cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$4}' > prefix.scfspan
tac prefix.scaff | p '$count=0 if(/^>/); $count++; print $_ if($count>2);' | awk '{print $4}' # sequence gaps
AGP
asmToAGP.pl < prefix.asm > prefix.agp
/--- buildPosMap ------> .posmap / .asm /---- asmToAGP ------------------------------------------------> .AGP \ / \---- cavalidate, bank2scaff?--> .scaff --- scaff2agp.pl ---
- Other scripts:
bank2scaff : incorrect gap sizes
validateAgp.pl < prefix.agp
infoseq 9-terminator/prefix.ctg.fasta | sed 's/ctg//' >! prefix.ctg.infoseq scaff2agp.pl -i prefix.ctg.infoseq < prefix.scaff > prefix.agp
Running subassemblies
Update frg file:
- clr=ECR2
- MDI -> DST
runCA parameters:
- doOBT=no
Test datasets
Porphyromonas_gingivalis_W83 : Sanger + 454
wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/porphyromonas_gingivalis_w83/anc.porphyromonas_gingivalis_w83.001.gz ... wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/porphyromonas_gingivalis_w83/xml.porphyromonas_gingivalis_w83.001.gz
- SRA : http://www.ncbi.nlm.nih.gov/sites/entrez?db=genomeprj&cmd=Link&LinkName=genomeprj_sra&from_uid=29961
- CBCB
/fs/sztmpscratch/dpuiu/porphyromonas_gingivalis_w83/
- Complete genome at NCBI : NC_002950.2 ( 2,343,476bp 48.29%GC)
bp_fetch.pl net::genbank:NC_002950.2 > porphyromonas_gingivalis_w83.1con
Input formatting
Read counts
zcat anc.*.gz | ~/bin/traceanc2anc.pl SEQ_LIB_ID INSERT_SIZE INSERT_STDEV SVECTOR_CODE | count.pl
TA formatting
- frg FORMAT2
- TIs are frg uids
- All libs
tracedb-to-frg.pl -xml xml.porphyromonas_gingivalis_w83.001.gz tracedb-to-frg.pl -lib xml.porphyromonas_gingivalis_w83.001.gz tracedb-to-frg.pl -frg xml.porphyromonas_gingivalis_w83.001.gz => porphyromonas_gingivalis_w83.1.lib.frg porphyromonas_gingivalis_w83.2.001.frg.bz2 porphyromonas_gingivalis_w83.3.lkg.frg
- One lib at a time
tracedb-to-frg.pl -only T13146 -xml xml.porphyromonas_gingivalis_w83.001.gz >& ! tracedb-to-frg.log tracedb-to-frg.pl -only T13146 -lib xml.porphyromonas_gingivalis_w83.001.gz >>& tracedb-to-frg.log tracedb-to-frg.pl -only T13146 -frg xml.porphyromonas_gingivalis_w83.001.gz >>& tracedb-to-frg.log
454 SRA formatting
Illumina formatting
- use fastqToCA (wgs-6.0)
fastqToCA \
-insertsize 3000 300 \
-libraryname 1 \
-type illumina \
-fastq /fs/szattic-asmg4/Bees/Bombus_impatiens/s_1_1_sequence.txt,/fs/szattic-asmg4/Bees/Bombus_impatiens/s_1_2_sequence.txt | \
sed 's/ori:I/ori:O/' \
> 1.frg
=>
{LIB
act:A
acc:1
ori:O
mea:3000.000
std:300.000
src:
.
nft:10
fea:
forceBOGunitigger=1
isNotRandom=0
doNotTrustHomopolymerRuns=0
doRemoveDuplicateReads=0
doNotQVTrim=0
goodBadQVThreshold=0
doNotOverlapTrim=0
usePackedFragments=1
illuminaFastQType=illumina
illuminaSequence=/fs/szattic-asmg4/Bees/Bombus_impatiens/s_1_1_sequence.txt,/fs/szattic-asmg4/Bees/Bombus_impatiens/s_1_2_sequence.txt
.
}
gatekeeper \ -o asm.gkpStore.BUILDING \ -T -F \ -E asm.gkpStore.errorLog \ 1.frg => Segmentation fault
FASTQ
- This won't remove the linker:
zcat SRR001351.fastq.gz | ~/bin/fastq2seq.pl > SRR001351.seq zcat SRR001351.fastq.gz | ~/bin/fastq2qual.pl > SRR001351.qual convert-fasta-to-v2.pl -454 -l SRR001351 -s SRR001351.seq -q SRR001351.qual
SFF
sffToCA -libraryname SRR001351 -output SRR001351.frg SRR001351.sff
Running
- Command
~/wgs/Linux-amd64/bin/runCA -p pging -d testassembly porphyromonas_gingivalis_w83.* >& runCA.log