Dpuiu CA: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
= Sourceforge = | |||
* [http://sourceforge.net/projects/wgs-assembler/ Sourceforge] | * [http://sourceforge.net/projects/wgs-assembler/ Sourceforge] | ||
* [http://apps.sourceforge.net/mediawiki/wgs-assembler/index.php?title=Main_Page Wiki] | * [http://apps.sourceforge.net/mediawiki/wgs-assembler/index.php?title=Main_Page Wiki] | ||
cvs -z3 -d:pserver:anonymous@wgs-assembler.cvs.sourceforge.net:/cvsroot/wgs-assembler co -P wgs-assembler | cvs -z3 -d:pserver:anonymous@wgs-assembler.cvs.sourceforge.net:/cvsroot/wgs-assembler co -P wgs-assembler | ||
= Job submission = | |||
cat runCA.sh | cat runCA.sh | ||
touch runCA.log | touch runCA.log | ||
Line 30: | Line 28: | ||
* for mated libs set mean=3000,stdev=300 | * for mated libs set mean=3000,stdev=300 | ||
= Parameters = | |||
* stopAfter: | * stopAfter: | ||
initialStoreBuilding | initialStoreBuilding | ||
Line 43: | Line 41: | ||
* doExtendClearRanges = 2 => cgw run 3 times, ECR 2 times | * doExtendClearRanges = 2 => cgw run 3 times, ECR 2 times | ||
== Log file & runtime (wgs-5.2) == | |||
* Use ~dpuiu/bin/getCAruntimes.pl | * Use ~dpuiu/bin/getCAruntimes.pl | ||
* [[Media:RunCA.log|runCA.log]] [[Media:RunCA.log.times|runCA.times]] bos taurus BCM.CLONEEND 16,875 Sanger reads | * [[Media:RunCA.log|runCA.log]] [[Media:RunCA.log.times|runCA.times]] bos taurus BCM.CLONEEND 16,875 Sanger reads | ||
* [[Media:RunCA.2.log|runCA.log]] [[Media:RunCA.2.times|runCA.times]] wolbachia symbioint of culex pipiens quinquefasciatus 36,767 Sanger reads | * [[Media:RunCA.2.log|runCA.log]] [[Media:RunCA.2.times|runCA.times]] wolbachia symbioint of culex pipiens quinquefasciatus 36,767 Sanger reads | ||
= Gatekeeper = | |||
== | == Build == | ||
== Add == | |||
== Dump == | |||
* sequences | * sequences | ||
Line 65: | Line 63: | ||
? | ? | ||
== Update == | |||
* Using messages : act:R !!! | * Using messages : act:R !!! | ||
Line 83: | Line 81: | ||
gatekeeper --edit prefix.update prefix.gkpStore > prefix.gkpStore.seq | gatekeeper --edit prefix.update prefix.gkpStore > prefix.gkpStore.seq | ||
== | == Delete == | ||
* DST messages can not be deleted | * DST messages can not be deleted | ||
* A FRG can be deleted only if there is no related LKG message; an abbreviated MSG | * A FRG can be deleted only if there is no related LKG message; an abbreviated MSG | ||
Line 97: | Line 95: | ||
} | } | ||
= Terminator = | |||
'''Getting output seq''' | '''Getting output seq''' | ||
<pre> | <pre> | ||
Line 108: | Line 106: | ||
$ cat *.asm | grep -B 8 ^sta:S | grep ^acc | perl -ane '/(\d+)/; print "utg".$1,"\n";' | $ cat *.asm | grep -B 8 ^sta:S | grep ^acc | perl -ane '/(\d+)/; print "utg".$1,"\n";' | ||
= Output processing = | |||
* Get posMap files | * Get posMap files | ||
Line 188: | Line 186: | ||
awk '{print $5,$6}' scf-deg.filter-q.variants.pairs > scf-deg.filter-q.variants.ids | awk '{print $5,$6}' scf-deg.filter-q.variants.pairs > scf-deg.filter-q.variants.ids | ||
= Running subassemblies = | |||
Update frg file: | Update frg file: | ||
* clr=ECR2 | * clr=ECR2 | ||
Line 196: | Line 194: | ||
* doOBT=no | * doOBT=no | ||
= Test datasets = | |||
# Porphyromonas_gingivalis_W83 /fs/szdata/454p/Porphyromonas_gingivalis_W83/ | # Porphyromonas_gingivalis_W83 /fs/szdata/454p/Porphyromonas_gingivalis_W83/ | ||
1 .frg & 4 .sff's | 1 .frg & 4 .sff's |
Revision as of 17:28, 14 August 2009
Sourceforge
cvs -z3 -d:pserver:anonymous@wgs-assembler.cvs.sourceforge.net:/cvsroot/wgs-assembler co -P wgs-assembler
Job submission
cat runCA.sh touch runCA.log echo --- runCA $$ restarted by $USER at `date` --- >>& runCA.log runCA -d . -p prefix -s spec prefix.frg >>& runCA.log
- Sanger
runCA -d . -p prefix -s spec prefix.frg
- doOverlapTrimming=1 (default)
- low qual reads (ex: 20 or 'D') are discarded
- astatLowBound 1 (default) ; can be lowered for low/un-uniform coverage
- astatHighBound 5 (default)
- computeInsertSize=0 (default)
- 454 & Hybrid
runCA -d . overlapper=mer unitigger=bog -p prefix *frg *.sff
- gatekeeper -L : search for 454 paired end linker
- AS_GKP_sff.c : Load_SFF function
- for mated libs set mean=3000,stdev=300
Parameters
- stopAfter:
initialStoreBuilding overlapBasedTrimming | OBT overlapper unitigger consensusAfterUnitigger scaffolder consensusAfterScaffolder
- cgwOutputIntermediate = 1 => "cgw -G"
=> .cgw & .cgw_contigs (generated only at the last doExtendClearRanges step)
- doExtendClearRanges = 2 => cgw run 3 times, ECR 2 times
Log file & runtime (wgs-5.2)
- Use ~dpuiu/bin/getCAruntimes.pl
- runCA.log runCA.times bos taurus BCM.CLONEEND 16,875 Sanger reads
- runCA.log runCA.times wolbachia symbioint of culex pipiens quinquefasciatus 36,767 Sanger reads
Gatekeeper
Build
Add
Dump
- sequences
gatekeeper -dumpfragments -withsequence prefix.gkpStore > prefix.gkpStore.seq
- frg file
gatekeeper -dumpfrg prefix.gkpStore > prefix.gkpStore.frg
- all
gatekeeper -dumpfragments -tabular prefix.gkpStore > prefix.gkpStore.tab
- clear ranges
?
Update
- Using messages : act:R !!!
{DST act:R acc:10691 mea:161662 std:28760 }
- Using tab file
Example: No restrictions on lib stddev with --edit option
cat prefix.update lib uid 19070 mean 167001 lib uid 19070 stddev 167001
gatekeeper --edit prefix.update prefix.gkpStore > prefix.gkpStore.seq
Delete
- DST messages can not be deleted
- A FRG can be deleted only if there is no related LKG message; an abbreviated MSG
{LKG act:D typ:M fg1:94376 fg2:94654 } {FRG act:D acc:94376 }
Terminator
Getting output seq
asmOutputFasta -p prefix < prefix.asm
Getting surrogate ids's
#most of them have negative aStat's $ cat *.asm | grep -B 8 ^sta:S | grep ^acc | perl -ane '/(\d+)/; print "utg".$1,"\n";'
Output processing
- Get posMap files
buildPosMap -o prefix < prefix.asm
- Get scaff files
cavalidate -e 10 prefix bank2scaff prefix.bnk/ > prefix.scaff
~/bin/ca2scaff -i prefix.asm -o prefix.scaff
cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$2}' > prefix.scfctg cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$3}' > prefix.scflen cat prefix.scaff | grep "^>" | sed 's/>//' | awk '{print $1,$4}' > prefix.scfspan
tac prefix.scaff | p '$count=0 if(/^>/); $count++; print $_ if($count>2);' | awk '{print $4}' # sequence gaps
- Get AGP files
asmToAGP.pl < prefix.asm > prefix.agp
/--- buildPosMap ------> .posmap / .asm /---- asmToAGP ------------------------------------------------> .AGP \ / \---- cavalidate, bank2scaff?--> .scaff --- scaff2agp.pl ---
- Other scripts:
bank2scaff : incorrect gap sizes
valiadteAgp.pl < prefix.agp
infoseq 9-terminator/prefix.ctg.fasta | sed 's/ctg//' >! prefix.ctg.infoseq scaff2agp.pl -i prefix.ctg.infoseq < prefix.scaff > prefix.agp
- Get new read clear ranges (latest=ECR2):
gatekeeper -dumpfragments -tabular -clear [ORIG|QLT|VEC|OBTINI|OBT|UTG|ECR1|ECR2] prefix.gkpStore/ | perl -ane 'print join "\t", @F[0,-2,-1]; print "\n";' > prefix.ECR2.clr
- Get new library insert sizes:
asm2mdi.pl < prefix.asm > prefix.mdi
- Get number of reads/ctg or scf (from the pos files)
cat prefix.posmap.frgctg | count.pl > prefix.posmap.frgctg.count cat prefix.posmap.frgctg | count.pl > prefix.posmap.frgscf.count cat prefix.posmap.ctgscf | count.pl > prefix.posmap.ctgscf.count ...
- Compute bp in scaffolds (remove N's)
- Compute read/mate cvg (from the pos files)
posmap2cvg.pl < prefix.posmap.frgscf > prefix.posmap.frgscf.frg_cvg posmap2cvg.pl -mates prefix.posmap.mates < prefix.posmap.frgscf > prefix.posmap.frgscf.mate_cvg
- Compute ctg cvg(gaps & overlapping ctgs)
posmap2cvg.pl < prefix.posmap.ctgscf > prefix.posmap.ctg_cvg
- Convert ugs for bank features
posmap2feat.pl < prefix.posmap.utgctg > prefix.posmap.utgctg.feat
- Qc stats based on posmap files
posmap2qc.pl -ctgscf prefix.posmap.ctgscf > prefix.qc
- Break scf/ctg pipeline
~/bin/breakPosmapKeep.amos
- Get ctg/deg/scaffold links from the posmap files
~/bin/join12.pl -0 prefix.posmap.mates prefix.posmap.frgscf prefix.posmap.frgdeg >! prefix.posmap.mates2 cat prefix.posmap.mates2 | awk '{print $3,$4,$8}' | p '@F[1,2]=@F[2,1] if ($F[1] gt $F[2]); print join "\t",@F; print "\n"' | count.pl > prefix.posmap.mates2.count cat prefix.posmap.mates2.count | grep -v '\.' | p 'print $_ if($F[1] ne $F[2]);' > prefix.posmap.mates2.count.diff cat prefix.posmap.mates2.count.diff | grep diffScaff | cut -f2,3,4 | ~/bin/cluster.pl > ! prefix.posmap.mates2.count.diffScaff.cluster cat prefix.posmap.mates2.7180002040716-7180002040985 | perl -ane '@F[0,1, 3,4,5,6, 7,8,9,10 ]=@F[1,0, 7,8,9,10, 3,4,5,6, ] if ($F[3] gt $F[7]); print join "\t",@F; print "\n";' | awk '{print $7,$11}' | count.pl
- ALign deg to scaff
nucmer -mum -l 40 -c 250 scf.fasta deg.fasta -p scf-deg delta-filter -q scf-deg.delta > scf-deg.filter-q.delta ~/bin/delta2pairCvg+.pl < scf-deg.filter-q.delta > scf-deg.filter-q.cvg.pairs ~/bin/max2.pl -i 4 -j 6 < scf-deg.filter-q.cvg.pairs > scf-deg.filter-q.maxCvg.pairs cat scf-deg.filter-q.maxCvg.pairs | p 'print $_ if($F[5]-$F[6]<2000);' > scf-deg.filter-q.variants.pairs awk '{print $5,$6}' scf-deg.filter-q.variants.pairs > scf-deg.filter-q.variants.ids
Running subassemblies
Update frg file:
- clr=ECR2
- MDI -> DST
runCA parameters:
- doOBT=no
Test datasets
- Porphyromonas_gingivalis_W83 /fs/szdata/454p/Porphyromonas_gingivalis_W83/
1 .frg & 4 .sff's wgs 5.1 failed in consensus log: 003 failed -- no .success or no .cgi! 012 failed -- no .success or no .cgi! step PostUnitiggerConsensus failed with '2 consensusAfterUnitigger jobs failed. Good luck.'