Skip to content

Commit e22e016

Browse files
authored
Merge branch 'main' into improve_python_dependency
2 parents 89701d4 + 0640845 commit e22e016

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+407
-219
lines changed

docs/articles/semantic_chunking.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
# Semantic Chunking
2+
13
<!-- SEO: Explore semantic chunking for Retrieval Augmented Generation (RAG) in this comprehensive guide. Learn about embedding similarity, hierarchical clustering, and LLM-based methods for optimal text chunking. Discover how semantic chunking improves RAG performance compared to traditional rule-based approaches. Includes code examples, evaluation metrics, and comparisons using HotpotQA and SQuAD datasets with BAAI/bge-small-en-v1.5 embeddings.
24
-->
35

4-
# Semantic Chunking
5-
66
Chunking in Natural Language Processing is simply dividing large bodies of text into smaller pieces that computers can manage more easily. Splitting large datasets into chunks enables your Retrieval Augmented Generation (RAG) system to embed, index, and store even very large datasets optimally. But *how* you chunk your data is crucial in determining whether you can efficiently return only the most relevant results to your user queries.
77

88
To get your RAG system to handle user queries better, you need a chunking method that's a good fit for your data. Some widely used chunking algorithms are **rule-based** - e.g., fixed character splitter, recursive character splitter, and document-specific splitter. But in some real-world applications, rule-based methods struggle. If, for example, your dataset has multi-topic documents, rule-based splitting algorithms can result in incomplete contexts or noise-filled chunks. **Semantic chunking**, on the other hand - because it divides text on the basis of meaning rather than rules - creates chunks that are semantically independent and cohesive, and therefore results in more effective text processing and information retrieval.

docs/tools/vdb_table/data/activeloop.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@
157157
"comment": ""
158158
},
159159
"github_stars": {
160-
"value": 8081,
160+
"value": 8101,
161161
"source_url": "https://github.com/activeloopai/deeplake",
162162
"comment": "",
163163
"value_90_days": 0
@@ -169,10 +169,10 @@
169169
"value_90_days": 0
170170
},
171171
"pypi_downloads": {
172-
"value": 917329,
172+
"value": 949405,
173173
"source_url": "https://pypi.org/project/deeplake/",
174174
"comment": "",
175-
"value_90_days": 174429
175+
"value_90_days": 167920
176176
},
177177
"npm_downloads": {
178178
"value": 0,

docs/tools/vdb_table/data/aerospike.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,10 +178,10 @@
178178
"value_90_days": 0
179179
},
180180
"pypi_downloads": {
181-
"value": 1191,
181+
"value": 1671,
182182
"source_url": "https://pypi.org/project/aerospike-vector/",
183183
"comment": "",
184-
"value_90_days": 524
184+
"value_90_days": 926
185185
},
186186
"npm_downloads": {
187187
"value": 0,

docs/tools/vdb_table/data/anariai.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,10 @@
174174
"value_90_days": 0
175175
},
176176
"npm_downloads": {
177-
"value": 4909,
177+
"value": 5096,
178178
"source_url": "https://www.npmjs.com/package/epsillajs",
179179
"comment": "",
180-
"value_90_days": 521
180+
"value_90_days": 608
181181
},
182182
"crates_io_downloads": {
183183
"value": 0,

docs/tools/vdb_table/data/apachecassandra.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,33 +157,33 @@
157157
"comment": "via Lucene"
158158
},
159159
"github_stars": {
160-
"value": 8755,
160+
"value": 8793,
161161
"source_url": "https://github.com/apache/cassandra",
162162
"comment": "",
163163
"value_90_days": 0
164164
},
165165
"docker_pulls": {
166-
"value": 215709756,
166+
"value": 216043667,
167167
"source_url": "https://hub.docker.com/_/cassandra",
168168
"comment": "",
169169
"value_90_days": 0
170170
},
171171
"pypi_downloads": {
172-
"value": 83982033,
172+
"value": 85375406,
173173
"source_url": "https://pypi.org/project/cassandra-driver/",
174174
"comment": "",
175-
"value_90_days": 8977518
175+
"value_90_days": 9192091
176176
},
177177
"npm_downloads": {
178-
"value": 4571141,
178+
"value": 4598405,
179179
"source_url": "https://www.npmjs.com/package/cassandra-driver",
180180
"comment": "",
181-
"value_90_days": 884795
181+
"value_90_days": 868693
182182
},
183183
"crates_io_downloads": {
184-
"value": 88518,
184+
"value": 89762,
185185
"source_url": "https://crates.io/crates/cassandra",
186186
"comment": "",
187-
"value_90_days": 974
187+
"value_90_days": 1044
188188
}
189189
}

docs/tools/vdb_table/data/apachesolr.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,33 +157,33 @@
157157
"comment": "via Lucene"
158158
},
159159
"github_stars": {
160-
"value": 1178,
160+
"value": 1197,
161161
"source_url": "https://github.com/apache/solr",
162162
"comment": "",
163163
"value_90_days": 0
164164
},
165165
"docker_pulls": {
166-
"value": 343830137,
166+
"value": 344024528,
167167
"source_url": "https://hub.docker.com/_/solr",
168168
"comment": "",
169169
"value_90_days": 0
170170
},
171171
"pypi_downloads": {
172-
"value": 11847305,
172+
"value": 12001490,
173173
"source_url": "https://pypi.org/project/pysolr/",
174174
"comment": "",
175-
"value_90_days": 924379
175+
"value_90_days": 919478
176176
},
177177
"npm_downloads": {
178-
"value": 599287,
178+
"value": 595209,
179179
"source_url": "https://www.npmjs.com/package/solr-client",
180180
"comment": "",
181-
"value_90_days": 109339
181+
"value_90_days": 110632
182182
},
183183
"crates_io_downloads": {
184-
"value": 1483,
184+
"value": 1515,
185185
"source_url": "https://crates.io/crates/solr",
186186
"comment": "",
187-
"value_90_days": 215
187+
"value_90_days": 221
188188
}
189189
}

docs/tools/vdb_table/data/aperturedb.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -168,16 +168,16 @@
168168
"value_90_days": 0
169169
},
170170
"pypi_downloads": {
171-
"value": 153360,
171+
"value": 170148,
172172
"source_url": "https://pypi.org/project/aperturedb/",
173173
"comment": "",
174-
"value_90_days": 19781
174+
"value_90_days": 33621
175175
},
176176
"npm_downloads": {
177-
"value": 18936,
177+
"value": 18587,
178178
"source_url": "https://www.npmjs.com/package/aperture",
179179
"comment": "",
180-
"value_90_days": 1697
180+
"value_90_days": 1644
181181
},
182182
"crates_io_downloads": {
183183
"value": 0,

docs/tools/vdb_table/data/azureai.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,16 +179,16 @@
179179
"value_90_days": 0
180180
},
181181
"pypi_downloads": {
182-
"value": 30230401,
182+
"value": 31788852,
183183
"source_url": "https://pypi.org/project/azure-ai-ml/",
184184
"comment": "",
185-
"value_90_days": 9291085
185+
"value_90_days": 9778906
186186
},
187187
"npm_downloads": {
188-
"value": 6479858,
188+
"value": 6865795,
189189
"source_url": "https://www.npmjs.com/package/@azure/openai",
190190
"comment": "",
191-
"value_90_days": 2155449
191+
"value_90_days": 2258047
192192
},
193193
"crates_io_downloads": {
194194
"value": 0,

docs/tools/vdb_table/data/chroma.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@
157157
"comment": ""
158158
},
159159
"github_stars": {
160-
"value": 14811,
160+
"value": 15014,
161161
"source_url": "https://github.com/chroma-core/chroma",
162162
"comment": "",
163163
"value_90_days": 0
@@ -169,21 +169,21 @@
169169
"value_90_days": 0
170170
},
171171
"pypi_downloads": {
172-
"value": 18897962,
172+
"value": 19924126,
173173
"source_url": "https://pypi.org/project/chromadb/",
174174
"comment": "",
175-
"value_90_days": 5238534
175+
"value_90_days": 5528579
176176
},
177177
"npm_downloads": {
178-
"value": 1726093,
178+
"value": 1805790,
179179
"source_url": "https://www.npmjs.com/package/chromadb",
180180
"comment": "",
181-
"value_90_days": 475082
181+
"value_90_days": 494462
182182
},
183183
"crates_io_downloads": {
184-
"value": 13492,
184+
"value": 14791,
185185
"source_url": "https://crates.io/crates/chromadb",
186186
"comment": "",
187-
"value_90_days": 1368
187+
"value_90_days": 1597
188188
}
189189
}

docs/tools/vdb_table/data/clickhouse.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@
156156
"comment": "HNSW via USearch"
157157
},
158158
"github_stars": {
159-
"value": 37016,
159+
"value": 37224,
160160
"source_url": "https://github.com/ClickHouse/ClickHouse",
161161
"comment": "",
162162
"value_90_days": 0
@@ -168,21 +168,21 @@
168168
"value_90_days": 0
169169
},
170170
"pypi_downloads": {
171-
"value": 200794,
171+
"value": 202127,
172172
"source_url": "https://pypi.org/project/clickhouse/",
173173
"comment": "",
174-
"value_90_days": 8612
174+
"value_90_days": 8169
175175
},
176176
"npm_downloads": {
177-
"value": 10197127,
177+
"value": 11576731,
178178
"source_url": "https://www.npmjs.com/package/@clickhouse/client",
179179
"comment": "",
180-
"value_90_days": 4165890
180+
"value_90_days": 5018953
181181
},
182182
"crates_io_downloads": {
183-
"value": 439372,
183+
"value": 457291,
184184
"source_url": "https://crates.io/crates/clickhouse",
185185
"comment": "",
186-
"value_90_days": 92556
186+
"value_90_days": 95418
187187
}
188188
}

0 commit comments

Comments
 (0)