From f0a22c7687c69bf0b93cad50660b6e96453eef52 Mon Sep 17 00:00:00 2001
From: Wei Shen
Date: Mon, 19 Aug 2024 16:41:12 +0100
Subject: [PATCH] update docs
---
introduction/index.html | 15 +-
overview.svg | 554 +++++++++++++------------------------
releases/index.html | 10 +-
search/en.data.min.json | 2 +-
searching.svg | 518 ++++++++++++----------------------
tutorials/index/index.html | 9 +-
6 files changed, 398 insertions(+), 710 deletions(-)
diff --git a/introduction/index.html b/introduction/index.html
index 8d30ebf..f8a63b8 100644
--- a/introduction/index.html
+++ b/introduction/index.html
@@ -62,7 +62,7 @@
"url" : "https://bioinf.shenwei.me/LexicMap/introduction/",
"headline": "Introduction",
"description": "LexicMap is a nucleotide sequence alignment tool for efficiently querying gene, plasmid, viral, or long-read sequences against up to millions of prokaryotic genomes.\nTable of contents Table of contents Features Introduction Quick start Performance Indexing Searching Installation Algorithm overview Related projects Support License Features LexicMap is scalable to up to millions of prokaryotic genomes. The sensitivity of LexicMap is comparable with Blastn. The alignment is fast and memory-efficient. LexicMap is easy to install, we provide binary files with no dependencies for Linux, Windows, MacOS (x86 and arm CPUs).",
- "wordCount" : "1614",
+ "wordCount" : "1633",
"inLanguage": "en",
"isFamilyFriendly": "true",
"mainEntityOfPage": {
@@ -1542,7 +1542,7 @@ Introduction
We added the support of suffix matching of seeds, making seeds much more tolerant to mutations. Any 31-bp seed with a common ≥15 bp prefix or suffix can be matched, which means seeds are immune to any single SNP.
-A hierarchical index enables fast and low-memory variable-length seed matching and chaining.
+A hierarchical index enables fast and low-memory variable-length seed matching (prefix + suffix matching).
A pseudo alignment algorithm is used to find similar sequence regions from chaining results for alignment.
A Introduction
LexicMap enables efficient indexing and searching of both RefSeq+GenBank and the AllTheBacteria datasets (2.3 and 1.9 million genomes respectively).
+>AllTheBacteria
datasets (2.3 and 1.9 million prokaryotic assemblies respectively).
Running at this scale has previously only been achieved by Phylign (previously called mof-search).
+>Phylign (previously called mof-search), which compresses genomes with phylogenetic information and provides searching
+(prefiltering with COBS and alignment with minimap2).
For searching in all 2,340,672 GenBank+RefSeq prokaryotic genomes, Blastn is unable to run with this dataset on common servers as it requires >2000 GB RAM. (see
+ id="defs283" />
+ inkscape:current-layer="g283" />
1
- CA
+ id="text122">1 CA
A
+ id="text123">A
ACC
+ id="text124">ACC
2
- CA
+ id="text125">2 CA
C
+ id="text126">C
AGA
+ id="text127">AGA
3
- CA
+ id="text128">3 CA
G
+ id="text129">G
CAC
+ id="text130">CAC
4
- CA
+ id="text131">4 CA
T
+ id="text132">T
C
- AC
+ id="text133">CAC
5
+ id="text134">5
CATG
+ id="text135">CATG
AT
+ id="text136">AT
6
+ id="text137">6
CATG
+ id="text138">CATG
GT
+ id="text139">GT
7 CATTGA
+ id="text140">7 CATTGA
+ id="rect140" />
+ id="rect141" />
+ id="rect142" />
+ id="rect143" />
k
+ id="text143">k
-
+ id="text144">-
mer information
+ id="text145">mer information
(64
+ id="text146">(64
-
+ id="text147">-
bit)
+ id="text148">bit)
Genome batch ID (17
+ id="text149">Genome batch ID (17
-
+ id="text150">-
bit)
+ id="text151">bit)
+ id="path151" />
+ id="path152" />
Bit
+ id="text152">Bit
-
+ id="text153">-
packed
+ id="text154">packed
Genomes batch
+ id="text155">Genomes batch
b
+ id="text156">b
···
+ id="text157">···
Genome ID (17
+ id="text158">Genome ID (17
-
+ id="text159">-
bit)
+ id="text160">bit)
Position (28
+ id="text161">Position (28
-
+ id="text162">-
bit)
+ id="text163">bit)
Strand (1
+ id="text164">Strand (1
-
+ id="text165">-
bit)
+ id="text166">bit)
+ id="path166" />
+ id="path167" />
+ id="path168" />
+ id="rect168" />
+ id="rect169" />
+ id="rect170" />
+ id="rect171" />
+ id="rect172" />
+ id="rect173" />
+ id="rect174" />
+ id="rect175" />
+ id="rect176" />
+ id="rect177" />
+ id="rect178" />
+ id="rect179" />
+ id="rect180" />
+ id="rect181" />
+ id="rect182" />
+ id="rect183" />
+ id="rect184" />
+ id="rect185" />
+ id="rect186" />
+ id="rect187" />
+ id="rect188" />
+ id="rect189" />
+ id="rect190" />
+ id="rect191" />
+ id="path191" />
CA
+ id="text191">CA
TGCT
+ id="text192">TGCT
The
+ id="text193">The
k
+ id="text194">k
-
+ id="text195">-
mer
+ id="text196">mer
captured by
+ id="text197">captured by
probe
+ id="text198">probe
c’
+ id="text199">c’
CATG
+ id="text200">CATG
AA
+ id="text201">AA
CATG
+ id="text202">CATG
AC
+ id="text203">AC
CATG
+ id="text204">CATG
TG
+ id="text205">TG
CATG
+ id="text206">CATG
TT
+ id="text207">TT
···
+ id="text208">···
+ id="path208" />
Probe prefix = 2
+ id="text209">Probe prefix = 2
M
- in
- prefix
- =
- 4
+ id="text210">Min prefix = 4
+ id="path210" />
Set search
+ id="text211">Set search
range
+ id="text212">range
Find scanning
+ id="text213">Find scanning
start position
+ id="text214">start position
Variable
+ id="text215">Variable
-
+ id="text216">-
length seed matching
+ id="text217">length seed matching
+ id="path217" />
+ id="path218" />
Bit
+ id="text218">Bit
-
+ id="text219">-
packed
+ id="text220">packed
genome
+ id="text221">genome
batches
+ id="text222">batches
b
+ id="text223">b
a
+ id="text224">a
+ id="path224" />
Offsets in the seed data file
+ id="text225">Offsets in the seed data file
c’
+ id="text226">c’
(
+ id="text227">(
CA
+ id="text228">CA
A
+ id="text229">A
ACC
- ,
- 1)
+ id="text230">ACC, 1)
(
+ id="text231">(
CA
+ id="text232">CA
C
+ id="text233">C
AGA
- ,
- 2
- )
+ id="text234">AGA, 2)
(
+ id="text235">(
CA
+ id="text236">CA
G
+ id="text237">G
CAC
- ,
- 3
- )
+ id="text238">CAC, 3)
(
+ id="text239">(
CA
+ id="text240">CA
T
+ id="text241">T
CAC
- ,
- 4
- )
+ id="text242">CAC, 4)
CATG
+ id="text243">CATG
AA
+ id="text244">AA
+ id="path244" />
+ id="path245" />
+ id="path246" />
+ id="path247" />
+ id="rect247" />
+ id="path248" />
+ id="path249" />
In
+ id="text249">In
-
+ id="text250">-
RAM querying of the
- start
- position
+ id="text251">RAM querying of the start position
for scanning in
- the
- seed
- data file
+ id="text252">for scanning in the seed data file
Scanning records on disk
+ id="text253">Scanning records on disk
+ id="rect253" />
+ id="rect254" />
+ id="rect255" />
+ id="rect256" />
+ id="rect257" />
+ id="rect258" />
+ id="rect259" />
Seed direction (1
+ id="text259">Seed direction (1
-
+ id="text260">-
bit)
+ id="text261">bit)
+ id="path261" />
T
+ id="text262">T
able
+ id="text263">able
o
+ id="text264">o
f seed
+ id="text265">f seed
C
+ id="text266">C
ontent
- (
+ id="text267">ontent (
ToC
+ id="text268">ToC
)
+ id="text269">)
Scan
+ id="text270">Scan
from
+ id="text271">from
here
+ id="text272">here
CA
+ id="text273">CA
T
+ id="text274">T
+ id="path274" />
+ id="path275" />
+ id="path276" />
p’
+ id="text276">p’
-
+ id="text277">-
bp subsequence for
+ id="text278">bp subsequence for
choosing marker
+ id="text279">choosing marker
k
+ id="text280">k
-
+ id="text281">-
mers
+ id="text282">mers
+ id="path282" />
+ id="path283" />
diff --git a/releases/index.html b/releases/index.html
index 9d13d3b..acc472c 100644
--- a/releases/index.html
+++ b/releases/index.html
@@ -12,7 +12,7 @@
-
+
Releases | LexicMap: efficient sequence alignment against millions of prokaryotic genomes
@@ -38,7 +38,7 @@
content="Releases"
/>
-
+
@@ -47,7 +47,7 @@
-
+