Data
Original Traces
- 8 pairs of data files (paired ends)
cat trace.count | grep _1_ | sed 's/_sequence.txt//' | perl -ane 'print " ",$F[1],"\t",$F[0]/4,"\t",$F[0]/2,"\n";'
lib insert mates reads readLen ~coverage(500M genome) reverse adaptors comments
s_2_3kbp 3000 21,563,283 43,126,566 124 11 ? circularizarion
#s_2_5kbp 5000/300 36218589 72,437,178 35 5 yes ? insert size is << 5kbp
s_2_8kbp 8000 198377 396,754 124 0.1 ? ?
s_3 475 35548153 71,096,306 124 18
s_4 475 35471044 70,942,088 124 18
s_5 475 35616846 71,233,692 124 18
s_6 475 35303840 70,607,680 124 18
s_7 475 34893313 69,786,626 124 18
total . 198,594,856 397,189,712
Corrected Traces
lib insert mates reads
s_2_3kb 3000 4,823,235 9,646,470
s_2_8kb 8000 111,267 222,534
s_3 475 33,024,597 66,049,194
s_4 475 33,237,593 66,475,186
s_5 475 33,150,790 66,301,580
s_6 475 33,223,371 66,446,742
s_7 475 32,647,890 65,295,780
total . 170,218,743 340,437,486
Adaptors
>circularizarion
CGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA
>circularizarion.revcomp
TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG
Location
/fs/szattic-asmg5/Bees/Megachile_rotundata/error_correction/large_libs/s_?_?_?kb.sequence.cor.all.txt
/fs/szattic-asmg5/Bees/Megachile_rotundata/error_free/s_?_?_sequence.cor.txt
/fs/szattic-asmg5/Bees/Megachile_rotundata/frg/ # frg files to assemble
Assemblies
- CA Version: 6.1 (09/01/2010) /fs/szdevel/dpuiu/SourceForge/wgs-6.1/Linux-amd64/bin/runCA
- SOAP version 1.04: /nfshomes/dpuiu/szdevel/SOAPdenovo_Release1.04/
CA noOBT partial
- Bog consensus : 4014 error reads in 39 unitigs
grep FAILED 5-consensus/asm*err | wc -l # 4014
grep FAILED 5-consensus/asm*err | awk '{print $3}' | sort -u | wc -l # 39
. elem min q1 q2 q3 max mean n50 sum
utg 1437146 64 123 143 195 67048 308.78 870 443759899
ctg 37494 65 2185 3998 7706 191323 6380.39 10151 239226293
deg 1136469 64 123 143 184 5031 160.11 164 181954480
scf 20827 122 3228 6374 13700 202495 11508.95 20462 239696810
Move the assembly from
mulberry:/scratch2/dpuiu/Megachile_rotundata/Assembly/wgs-noOBT-partial/
to
ginkgo:/scratch1/dpuiu/Megachile_rotundata/Assembly/wgs-noOBT-partial/
CA noOBT
Gatekeeper
LOAD STATS
7 libInput
7 libLoaded
0 libErrors
5 libWarnings
326,236,387 frgLoaded
326236387 numRandom
326236387 numPacked
LibraryName numActiveFRG numDelFRG numMatedFRG readLength clearLength #AATCATACAATCACAATCATAC|GTATGATTGTGATTGTATGATT #AATCATACAATCAC|GTGATTGTATGATT #AATCATACAATCAC|GTGATTGTATGATT(1mutation)
GLOBAL 326,236,387 0 315518526 37451489553 37418130441
LegacyUnmatedReads 0 0 0 0 0
s_2_3kb 9107424 0 9107424 942165284 910444046 #1125738 12% 1366161
s_2_8kb 209336 0 209336 21814418 20787384 #46762 22% 55736(26%) 68729(32%)
s_3 63618839 0 61696784 7343024554 7342819494 # 11%
s_4 63544688 0 61255960 7291557748 7291478152 #6804757 11%
s_5 63370860 0 61084368 7271218123 7271051639 # 11%
s_6 63780887 0 61685156 7359094156 7359012512 # 11%
s_7 62604353 0 60479498 7222615270 7222537214 # 11%
Meryl
meryl -Dh -s 0-mercounts/asm-C-ms22-cm0
Found 30570218845 mers.
Found 271464470 distinct mers.
Found 11164787 unique mers.
Largest mercount is 87984949; 1896 mers are too big for histogram.
1 11164787 0.0411 0.0004
2 9376915 0.0757 0.0010
3 3714582 0.0894 0.0013
...
54 5344148 0.6573 0.1788
...
fasta2tab.pl 0-mercounts/asm.nmers.ovl.fasta | sort -n -r | head -5
87,908,217 AATCATACAATCACAATCATAC
84,450,288 CAATCATACAATCACAATCATA
...
74,975,282 AATAATATGAGTTAGATTGATA
egrep -c 'AATCATACAATCACAATCATAC|GTATGATTGTGATTGTATGATT' *fastq *txt > egrep.count
mulberry:/scratch2/dpuiu/Megachile_rotundata/Data/error_free/egrep.count
meryl -Dh -s 0-mercounts/asm-C-ms15-cm0 | head
Found 32850820919 mers.
Found 142500876 distinct mers.
Found 2381895 unique mers.
Largest mercount is 125816941; 2023 mers are too big for histogram.
1 2381895 0.0167 0.0001
2 2325770 0.0330 0.0002
3 708786 0.0380 0.0003
...
54 1851586 0.4894 0.0671
...
Overlap
cat 1-overlapper/ovlopts.pl | grep ^\"h | wc -l
924
- Failures: 709 of the 924 overlap jobs failed; runCA 6.1 could not restart the overlap stage properly, so the failed jobs were rerun by hand (see overlap.sh below)!
cat 1-overlapper/overlap*out | grep "^Could not" | sort -u
Could not malloc memory (1305184948 bytes)
cat 1-overlapper/*pl | grep ^\"0 | sed 's/"//' | sed 's/\",//' >! 1-overlapper/ovlopts.pl.0
cat 1-overlapper/*pl | grep ^\"h | sed 's/"//' | sed 's/\",//' >! 1-overlapper/ovlopts.pl.h
cat 1-overlapper/*pl | grep ^\"-h | sed 's/"//' | sed 's/\",//' >! 1-overlapper/ovlopts.pl.-h
paste 1-overlapper/ovlopts.pl.* | p 'print "overlap -M 8GB --hashload 0.8 -t 1 -h $F[3] -r $F[5] -k 22 -k \
./0-mercounts/asm.nmers.ovl.fasta -o ./1-overlapper/$F[0]/$F[1].ovb.gz ./asm.gkpStore > \
./1-overlapper/overlap.0$..out \n";' | tail -709 > overlap.sh
overlapStore -d asm.ovlStore | awk '{print $1}' | uniq -c | awk '{print $1}' | count.pl | getSummary.pl -i 0 -j 1
overlapStats -G asm.gkpStore -O asm.ovlStore -o asm
Location
mulberry:/scratch2/dpuiu/Megachile_rotundata/Assembly/wgs-noOBT
SOAPdenovo (Tanja)
cat *.ContigIndex | grep -v ^E | grep -v ^i | count.pl -i 1 | getSummary.pl -j 1 -t "contigs"
cat *.ContigIndex | grep -v ^E | grep -v ^i | count.pl -i 1 | getSummary.pl -j 1 -min 100 -t "contigs(>100bp)"
grep "^>" *.scaf | getSummary.pl -i 2 -t scaf
. elem min q1 q2 q3 max mean n50 sum
contigs 9742349 31 32 33 37 114832 60.09 44 585430821
contigs(>100bp) 177327 100 131 261 1398 114832 1333.68 3897 236496823 # N50 for Bee was 7K
scaf 7863 102 903 3272 17692 2338728 37825.70 240706 297423517 # N50 for Bee was 1.17M
/fs/szattic-asmg5/Bees/Megachile_rotundata/Assembly/assembly5kbForAll
SOAPdenovo (Daniela)
cat asm.K31.contig | grep "^>" | awk '{print $3}' | uniq -c | awk '{print $2,$1}' > asm.K31.contigLen.count
. elem min q1 q2 q3 max mean n50 sum
contigs(all) 6,917,796 31 32 34 40 121554 70.46 73 487,401,812
contigs(>100bp) 210,666 100 124 222 1174 121554 1108.69 3138 233,563,401
scaff 25,119 351 1896 4444 10914 1102803 11041.00 26876 277,338,897
reads 340,437,486
readsOnContigs 171,212,613
mulberry:/scratch2/dpuiu/Megachile_rotundata/Assembly/SOAPdenovo-redo