Data
Original Traces
- 8 pairs of data files (paired ends)
cat trace.count | grep _1_ | sed 's/_sequence.txt//' | perl -ane 'print " ",$F[1],"\t",$F[0]/4,"\t",$F[0]/2,"\n";'
lib insert mates reads readLen ~coverage(500M genome) reverse adaptors comments
s_2_3kbp 3000 21,563,283 43,126,566 124 11 ? circularization
#s_2_5kbp 5000/300 36218589 72,437,178 35 5 yes ? excluded from total: actual insert size (~300bp) is << nominal 5kbp
s_2_8kbp 8000 198377 396,754 124 0.1 ? ?
s_3 475 35548153 71,096,306 124 18
s_4 475 35471044 70,942,088 124 18
s_5 475 35616846 71,233,692 124 18
s_6 475 35303840 70,607,680 124 18
s_7 475 34893313 69,786,626 124 18
total . 198,594,856 397,189,712
Corrected Traces
lib insert mates reads
s_2_3kb 3000 4,823,235 9,646,470
s_2_8kb 8000 111,267 222,534
s_3 475 33,024,597 66,049,194
s_4 475 33,237,593 66,475,186
s_5 475 33,150,790 66,301,580
s_6 475 33,223,371 66,446,742
s_7 475 32,647,890 65,295,780
total . 170,218,743 340,437,486
Adaptors
>circularization
CGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA
>circularization.revcomp
TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG
Location
/fs/szattic-asmg5/Bees/Megachile_rotundata/error_correction/large_libs/s_?_?_?kb.sequence.cor.all.txt
/fs/szattic-asmg5/Bees/Megachile_rotundata/error_free/s_?_?_sequence.cor.txt
/fs/szattic-asmg5/Bees/Megachile_rotundata/frg/ # frg files to assemble
Assemblies
- CA Version: 6.1 (09/01/2010) /fs/szdevel/dpuiu/SourceForge/wgs-6.1/Linux-amd64/bin/runCA
- SOAP version 1.04: /nfshomes/dpuiu/szdevel/SOAPdenovo_Release1.04/
CA noOBT partial
LibraryName numActiveFRG numDeletedFRG numMatedFRG readLength clearLength
GLOBAL 72995448 0 70632632 8307194830 8278360381
LegacyUnmatedReads 0 0 0 0 0
s_2_3kb 9166343 0 8736228 942501164 914798596
s_2_8kb 210266 0 199620 21669112 20742291
s_3 63618839 0 61696784 7343024554 7342819494
. elem min q1 q2 q3 max mean n50 sum
utg 1437146 64 123 143 195 67048 308 870 443759899
ctg 37494 65 2185 3998 7706 191323* 6380 10151* 239226293
deg 1136469 64 123 143 184 5031 160 164 181954480
scf 20827 122 3228 6374 13700 202495 11508 20462 239696810
reads 72,995,448
singletons 34,881,692 (48%)
ginkgo:/scratch1/dpuiu/Megachile_rotundata/Assembly/wgs-noOBT-partial/
Moved from
mulberry:/scratch2/dpuiu/Megachile_rotundata/Assembly/wgs-noOBT-partial/
CA noOBT
Gatekeeper
LibraryName numActiveFRG numDelFRG numMatedFRG readLength clearLength #repeats
GLOBAL 326,236,387 0 315518526 37451489553 37418130441
LegacyUnmatedReads 0 0 0 0 0
s_2_3kb 9107424 0 9107424 942165284 910444046 #
s_2_8kb 209336 0 209336 21814418 20787384 #
s_3 63618839 0 61696784 7343024554 7342819494 #
s_4 63544688 0 61255960 7291557748 7291478152 #
s_5 63370860 0 61084368 7271218123 7271051639 #
s_6 63780887 0 61685156 7359094156 7359012512 #
s_7 62604353 0 60479498 7222615270 7222537214 #
Meryl
meryl -Dh -s 0-mercounts/asm-C-ms22-cm0
Found 30570218845 mers.
Found 271464470 distinct mers.
Found 11164787 unique mers.
Largest mercount is 87984949; 1896 mers are too big for histogram.
1 11164787 0.0411 0.0004
2 9376915 0.0757 0.0010
3 3714582 0.0894 0.0013
...
54 5344148 0.6573 0.1788
...
fasta2tab.pl 0-mercounts/asm.nmers.ovl.fasta | sort -n -r | head -5
87,908,217 AATCATACAATCACAATCATAC
84,450,288 CAATCATACAATCACAATCATA
...
74,975,282 AATAATATGAGTTAGATTGATA
egrep -c 'AATCATACAATCACAATCATAC|GTATGATTGTGATTGTATGATT' *fastq *txt > egrep.count
mulberry:/scratch2/dpuiu/Megachile_rotundata/Data/error_free/egrep.count
meryl -Dh -s 0-mercounts/asm-C-ms15-cm0 | head
Found 32850820919 mers.
Found 142500876 distinct mers.
Found 2381895 unique mers.
Largest mercount is 125816941; 2023 mers are too big for histogram.
1 2381895 0.0167 0.0001
2 2325770 0.0330 0.0002
3 708786 0.0380 0.0003
...
54 1851586 0.4894 0.0671
...
Overlap
cat 1-overlapper/ovlopts.pl | grep ^\"h | wc -l
924
- Failures: 709 jobs failed; runCA 6.1 could not restart overlap properly !!!
cat 1-overlapper/overlap*out | grep "^Could not" | sort -u
Could not malloc memory (1305184948 bytes)
cat 1-overlapper/*pl | grep ^\"0 | sed 's/"//' | sed 's/\",//' >! 1-overlapper/ovlopts.pl.0
cat 1-overlapper/*pl | grep ^\"h | sed 's/"//' | sed 's/\",//' >! 1-overlapper/ovlopts.pl.h
cat 1-overlapper/*pl | grep ^\"-h | sed 's/"//' | sed 's/\",//' >! 1-overlapper/ovlopts.pl.-h
paste 1-overlapper/ovlopts.pl.* | p 'print "overlap -M 8GB --hashload 0.8 -t 1 -h $F[3] -r $F[5] -k 22 -k \
./0-mercounts/asm.nmers.ovl.fasta -o ./1-overlapper/$F[0]/$F[1].ovb.gz ./asm.gkpStore > \
./1-overlapper/overlap.0$..out \n";' | tail -709 > overlap.sh
overlapStore -d asm.ovlStore | awk '{print $1}' | uniq -c | awk '{print $1}' | count.pl | getSummary.pl -i 0 -j 1
overlapStats -G asm.gkpStore -O asm.ovlStore -o asm
Location
mulberry:/scratch2/dpuiu/Megachile_rotundata/Assembly/wgs-noOBT
SOAPdenovo (Tanja)
cat *.ContigIndex | grep -v ^E | grep -v ^i | count.pl -i 1 | getSummary.pl -j 1 -t "contigs"
cat *.ContigIndex | grep -v ^E | grep -v ^i | count.pl -i 1 | getSummary.pl -j 1 -min 100 -t "contigs(>100bp)"
grep "^>" *.scaf | getSummary.pl -i 2 -t scaf
. elem min q1 q2 q3 max mean n50 sum
contigs 9742349 31 32 33 37 114832 60 44 585430821
contigs(>100bp) 177327 100 131 261 1398 114832 1333 3897 236496823 # N50 for Bee was 7K
scaf 7863 102 903 3272 17692 2338728 37825 240706 297423517 # N50 for Bee was 1.17M
/fs/szattic-asmg5/Bees/Megachile_rotundata/Assembly/assembly5kbForAll
SOAPdenovo (Daniela)
cat asm.K31.contig | grep "^>" | awk '{print $3}' | uniq -c | awk '{print $2,$1}' > asm.K31.contigLen.count
. elem min q1 q2 q3 max mean n50 sum
contigs(all) 6,917,796 31 32 34 40 121554 70 73 487,401,812
contigs(>100bp) 210,666 100 124 222 1174 121554 1108 3138 233,563,401
scaff 25,119 351 1896 4444 10914 1102803 11041 26876 277,338,897
reads 340,437,486
readsOnContigs 171,212,613
mulberry:/scratch2/dpuiu/Megachile_rotundata/Assembly/SOAPdenovo-redo