Dpuiu CA

From Cbcb
Revision as of 14:15, 17 August 2009 by Dpuiu (talk | contribs) (→‎Posmap)
Jump to navigation Jump to search

Sourceforge

 cvs -z3 -d:pserver:anonymous@wgs-assembler.cvs.sourceforge.net:/cvsroot/wgs-assembler co -P wgs-assembler
  • CBCB
 ~/szdevel/SourceForge/wgs-5.4/
  • Compile for debugging
 BUILDDEBUG  = 1

Run

  • Help
runCA.pl -fields
  • Batch submission
 touch runCA.log
   echo --- runCA $$ restarted by $USER at `date` --- >>& runCA.log
   runCA -d . -p prefix -s spec prefix.frg >>& runCA.log
 at runCA.sh   
  • Sanger
runCA -d . -p prefix -s spec prefix.frg
  • doOverlapTrimming=1 (default)
  • low qual reads (ex: 20 or 'D') are discarded
  • astatLowBound 1 (default)  ; can be lowered for low/un-uniform coverage
  • astatHighBound 5 (default)
  • computeInsertSize=0 (default)
  • 454 & Hybrid
runCA -d . overlapper=mer unitigger=bog -p prefix *frg *.sff
  • gatekeeper -L : search for 454 paired end linker
  • AS_GKP_sff.c  : Load_SFF function
  • for mated libs set mean=3000,stdev=300

Parameters

  • stopAfter:
   initialStoreBuilding
   overlapBasedTrimming | OBT
   overlapper
   unitigger
   consensusAfterUnitigger
   scaffolder
   consensusAfterScaffolder
  • cgwOutputIntermediate = 1 => "cgw -G"
 => .cgw & .cgw_contigs (generated only at the last doExtendClearRanges step)
  • doExtendClearRanges = 2 => cgw run 3 times, ECR 2 times

Gatekeeper

Build

Add

Update

  • Using messages : act:R !!!
 {DST
 act:R
 acc:10691
 mea:161662
 std:28760
 }
  • Using tab file
 Example: No restrictions on lib stddev with --edit option
 cat prefix.update
 lib     uid     19070   mean    167001
 lib     uid     19070   stddev  167001
 gatekeeper --edit prefix.update prefix.gkpStore > prefix.gkpStore.seq

Delete

  • DST messages can not be deleted
  • A FRG can be deleted only if there is no related LKG message; an abbreviated MSG
 {LKG
 act:D
 typ:M
 fg1:94376
 fg2:94654
 }
 {FRG
 act:D
 acc:94376
 }

Dump

  • FASTA sequences
 gatekeeper -dumpfragments -withsequence prefix.gkpStore > prefix.seq
  • FRG file
 gatekeeper -dumpfrg prefix.gkpStore > prefix.frg
  • ALL READS in TAB format
 gatekeeper -dumpfragments -tabular prefix.gkpStore > prefix.frg.info
  • ALL LIBS in TAB format
 gatekeeper -dumplibraries -tabular prefix.lib.info
  • READ CLR's (default=ECR2)
 gatekeeper -dumpfragments -tabular prefix.gkpStore/  | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > prefix.clr
 gatekeeper -dumpfragments -tabular -clear [ORIG|QLT|VEC|OBTINI|OBT|UTG|ECR1|ECR2] prefix.gkpStore/  | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > prefix.?.clr
  • Subset
 touch file.uids
 gatekeeper -uid file.uids ...

Meryl

OBT

Unitiger

CGW

Terminator

Log file & runtime (wgs-5.2)

Output processing

Posmap

  • Get posMap files
buildPosMap -o prefix < prefix.asm
  • Get number of reads/ctg or scf (from the pos files)
 cat prefix.posmap.frgctg | count.pl > prefix.posmap.frgctg.count
 cat prefix.posmap.frgctg | count.pl > prefix.posmap.frgscf.count
 cat prefix.posmap.ctgscf | count.pl > prefix.posmap.ctgscf.count
 ...
  • Compute read/mate/ctg cvg (from the pos files)
 posmap2cvg.pl                            < prefix.posmap.frgscf > prefix.posmap.frgscf.frg_cvg
 posmap2cvg.pl -mates prefix.posmap.mates < prefix.posmap.frgscf > prefix.posmap.frgscf.mate_cvg
 posmap2cvg.pl                             < prefix.posmap.ctgscf > prefix.posmap.ctg_cvg
  • Qc stats based on posmap files
 posmap2qc.pl -ctgscf prefix.posmap.ctgscf > prefix.qc
  • Break scf/ctg pipeline
 ~/bin/breakPosmapKeep.amos
  • Add ctg/scf into to the mates fils
 join12.pl prefix.posmap.mates prefix.posmap.frgctg > prefix.posmap.mates.ctg2
 join12.pl prefix.posmap.mates.ctg2 prefix.posmap.frgscf > prefix.posmap.mates.ctg2.scf2
  • Get links
 ~/bin/join12.pl prefix.posmap.mates prefix.posmap.frgscf | perl -ane 'next if($F[3] eq $F[7]); @F[3,7]=@F[7,3] if($F[3] gt $F[7]); print join "\t",@F[3,7,2]; print "\n";' | count.pl > prefix.posmap.scf2scf
 ~/bin/join12.pl prefix.posmap.mates prefix.posmap.frgctg | perl -ane 'next if($F[3] eq $F[7]); @F[3,7]=@F[7,3] if($F[3] gt $F[7]); print join "\t",@F[3,7,2]; print "\n";' | count.pl > prefix.posmap.ctg2ctg
 ~/bin/join12.pl prefix.posmap.mates prefix.posmap.frgscf prefix.posmap.frgdeg | perl -ane 'next if($F[3] eq $F[7]); @F[3,7]=@F[7,3] if($F[3] gt $F[7]); print join "\t",@F[3,7,2]; print "\n";' | count.pl > prefix.posmap.scf2deg
  • Get ctg/deg/scaffold links from the posmap files
 ~/bin/join12.pl -0 prefix.posmap.mates prefix.posmap.frgscf prefix.posmap.frgdeg >! prefix.posmap.mates2
 cat prefix.posmap.mates2 | awk '{print $3,$4,$8}' | p '@F[1,2]=@F[2,1] if ($F[1] gt $F[2]); print join  "\t",@F; print "\n"' | count.pl >  prefix.posmap.mates2.count
 cat prefix.posmap.mates2.count | grep -v '\.' | p 'print $_ if($F[1] ne $F[2]);' > prefix.posmap.mates2.count.diff
  
 cat prefix.posmap.mates2.count.diff | grep diffScaff | cut -f2,3,4 | ~/bin/cluster.pl > ! prefix.posmap.mates2.count.diffScaff.cluster
 cat prefix.posmap.mates2.7180002040716-7180002040985 | perl -ane '@F[0,1, 3,4,5,6, 7,8,9,10 ]=@F[1,0, 7,8,9,10,  3,4,5,6, ] if ($F[3] gt $F[7]); print join "\t",@F; print "\n";' | awk '{print $7,$11}' | count.pl
  • Display graphs

Fasta

  • Align deg to scaff
 nucmer -mum -l 40 -c 250 scf.fasta deg.fasta  -p scf-deg
 delta-filter -q scf-deg.delta > scf-deg.filter-q.delta
 ~/bin/delta2pairCvg+.pl < scf-deg.filter-q.delta > scf-deg.filter-q.cvg.pairs
 ~/bin/max2.pl -i 4 -j 6 < scf-deg.filter-q.cvg.pairs > scf-deg.filter-q.maxCvg.pairs
 cat scf-deg.filter-q.maxCvg.pairs | p 'print $_ if($F[5]-$F[6]<2000);' > scf-deg.filter-q.variants.pairs
 awk '{print $5,$6}' scf-deg.filter-q.variants.pairs > scf-deg.filter-q.variants.ids
  • Indexing
 # contigs
 ~/bin/formatdb.pl < prefix.ctg.fasta > prefix.ctg.fasta.offset
 ~/bin/fastacmd.pl -o prefix.ctg.fasta.offset -f filter.ctg.ids < prefix.ctg.fasta > filter.ctg.fasta
 # same for scaff, deg ...
  • Contaminant search : nucmer against
 /nfshomes/dpuiu/db/Ecoli.all
 /nfshomes/dpuiu/db/UniVec_Core
 /nfshomes/dpuiu/db/OtherVec

Asm

  • Get output seq
 asmOutputFasta -p prefix < prefix.asm
  • Get output seq clr
 ~/bin/asm2clrs.pl < prefix.asm > < prefix.asm.clr
  • Get new library insert sizes:
 asm2mdi.pl < prefix.asm > prefix.mdi
  • Get surrogate ids
 #most of them have negative aStat's
 $ cat *.asm | grep -B 8 ^sta:S | grep ^acc | perl -ane '/(\d+)/; print "utg".$1,"\n";'

Qc

 caqc.pl -euid  prefix.asm

Scaffolds

  • Get scaff files
 cavalidate -e 10 prefix
 bank2scaff prefix.bnk/ > prefix.scaff
 ~/bin/ca2scaff -i prefix.asm -o prefix.scaff
 cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$2}' > prefix.scfctg
 cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$3}' > prefix.scflen  
 cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$4}' > prefix.scfspan  
 tac prefix.scaff | p '$count=0 if(/^>/); $count++; print $_ if($count>2);' | awk '{print $4}'   # sequence gaps

AGP

 asmToAGP.pl < prefix.asm > prefix.agp


        /--- buildPosMap ------> .posmap
       /
  .asm /---- asmToAGP ------------------------------------------------> .AGP 
      \                                                              /   
       \---- cavalidate, bank2scaff?--> .scaff --- scaff2agp.pl --- 
  • Other scripts:
 bank2scaff : incorrect gap sizes
 valiadteAgp.pl < prefix.agp
 infoseq 9-terminator/prefix.ctg.fasta | sed 's/ctg//' >! prefix.ctg.infoseq
 scaff2agp.pl -i prefix.ctg.infoseq < prefix.scaff > prefix.agp

Running subassemblies

Update frg file:

  • clr=ECR2
  • MDI -> DST

runCA parameters:

  • doOBT=no

Test datasets

Porphyromonas_gingivalis_W83 : Sanger + 454

 wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/porphyromonas_gingivalis_w83/anc.porphyromonas_gingivalis_w83.001.gz
 ...
 wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/porphyromonas_gingivalis_w83/xml.porphyromonas_gingivalis_w83.001.gz
 /fs/sztmpscratch/dpuiu/porphyromonas_gingivalis_w83/

Input formatting

TA data

 ~/wgs/Linux-amd64/bin/tracedb-to-frg.pl -xml xml.porphyromonas_gingivalis_w83.001.gz
 ~/wgs/Linux-amd64/bin/tracedb-to-frg.pl -lib xml.porphyromonas_gingivalis_w83.001.gz
 ~/wgs/Linux-amd64/bin/tracedb-to-frg.pl -frg xml.porphyromonas_gingivalis_w83.001.gz
 => frg FORMAT2 
 => TI are frg uids
  • Running
 ~/wgs/Linux-amd64/bin/runCA -p pging -d testassembly porphyromonas_gingivalis_w83.*