
Commit

Docs in progress
dluc committed Jan 3, 2024
1 parent f5dd753 commit 50f0dec
Showing 31 changed files with 1,012 additions and 317 deletions.
26 changes: 24 additions & 2 deletions KernelMemory.sln
@@ -24,9 +24,31 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "clients", "clients", "{371B
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{7BA7F1B2-19E2-46EB-B000-513EE2F65769}"
ProjectSection(SolutionItems) = preProject
docs\FAQ.md = docs\FAQ.md
docs\SECURITY_FILTERS.md = docs\SECURITY_FILTERS.md
docs\_config.local.yml = docs\_config.local.yml
docs\_config.yml = docs\_config.yml
docs\csharp.png = docs\csharp.png
docs\service.md = docs\service.md
docs\serverless.md = docs\serverless.md
docs\security.md = docs\security.md
docs\run.cmd = docs\run.cmd
docs\quickstart.md = docs\quickstart.md
docs\quickstart-swagger.png = docs\quickstart-swagger.png
docs\quickstart-dotnet-run.png = docs\quickstart-dotnet-run.png
docs\python.png = docs\python.png
docs\packages.md = docs\packages.md
docs\network.png = docs\network.png
docs\logo.png = docs\logo.png
docs\java.png = docs\java.png
docs\index.md = docs\index.md
docs\how-tos.md = docs\how-tos.md
docs\Gemfile.lock = docs\Gemfile.lock
docs\Gemfile = docs\Gemfile
docs\features.md = docs\features.md
docs\favicon.png = docs\favicon.png
docs\FAQ.md = docs\FAQ.md
docs\extensions.md = docs\extensions.md
docs\concepts.md = docs\concepts.md
docs\404.html = docs\404.html
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{0A43C65C-6007-4BB4-B3FE-8D439FC91841}"
40 changes: 32 additions & 8 deletions docs/_config.local.yml
@@ -1,5 +1,10 @@
# Config for local Jekyll tests
theme: just-the-docs

# ==================================================
# == everything below is the same as _config.yaml ==
# ==================================================

title: Kernel Memory
description: >-
Index and query any data using LLM and natural language, tracking sources and
@@ -10,18 +15,16 @@ description: >-
baseurl: "/kernel-memory"
url: ""

plugins:
- jekyll-feed

favicon_ico: "/favicon.png"
search_enabled: true
enable_copy_code_button: true
heading_anchors: true
color_scheme: custom

callouts:
highlight:
color: yellow
console:
color: green
important:
title: Important
color: blue
@@ -36,10 +39,10 @@ callouts:
color: red

nav_external_links:
- title: KM on GitHub
url: https://github.com/microsoft/kernel-memory
- title: KM on Discord
url: https://aka.ms/KMDiscord
- title: KM on GitHub
url: https://github.com/microsoft/kernel-memory
- title: KM on Discord
url: https://aka.ms/KMDiscord

aux_links:
"GitHub":
@@ -56,3 +59,24 @@ gh_edit_repository: "https://github.com/microsoft/kernel-memory"
gh_edit_branch: "main"
gh_edit_source: docs
gh_edit_view_mode: "tree"

search_enabled: true
search:
# Split pages into sections that can be searched individually # Supports 1 - 6, default: 6
heading_level: 6
# Maximum amount of previews per search result # Default: 3
previews: 3
# Maximum amount of words to display before a matched word in the preview # Default: 5
preview_words_before: 6
# Maximum amount of words to display after a matched word in the preview # Default: 10
preview_words_after: 10
# Set the search token separator # Default: /[\s\-/]+/ # Example: enable support for hyphenated search words
tokenizer_separator: /[\s/]+/
# Display the relative url in search results # Supports true (default) or false
rel_url: false
# Enable or disable the search button that appears in the bottom right corner of every page # Supports true or false (default)
button: true

mermaid:
# See https://cdn.jsdelivr.net/npm/mermaid/
version: "10.6.1"
40 changes: 32 additions & 8 deletions docs/_config.yml
@@ -1,5 +1,10 @@
# Config for GitHub Pages
remote_theme: just-the-docs/[email protected]

# ========================================================
# == everything below is the same as _config.local.yaml ==
# ========================================================

title: Kernel Memory
description: >-
Index and query any data using LLM and natural language, tracking sources and
@@ -10,18 +15,16 @@ description: >-
baseurl: "/kernel-memory"
url: ""

plugins:
- jekyll-feed

favicon_ico: "/favicon.png"
search_enabled: true
enable_copy_code_button: true
heading_anchors: true
color_scheme: custom

callouts:
highlight:
color: yellow
console:
color: green
important:
title: Important
color: blue
@@ -36,10 +39,10 @@ callouts:
color: red

nav_external_links:
- title: KM on GitHub
url: https://github.com/microsoft/kernel-memory
- title: KM on Discord
url: https://aka.ms/KMDiscord
- title: KM on GitHub
url: https://github.com/microsoft/kernel-memory
- title: KM on Discord
url: https://aka.ms/KMDiscord

aux_links:
"GitHub":
@@ -56,3 +59,24 @@ gh_edit_repository: "https://github.com/microsoft/kernel-memory"
gh_edit_branch: "main"
gh_edit_source: docs
gh_edit_view_mode: "tree"

search_enabled: true
search:
# Split pages into sections that can be searched individually # Supports 1 - 6, default: 6
heading_level: 6
# Maximum amount of previews per search result # Default: 3
previews: 3
# Maximum amount of words to display before a matched word in the preview # Default: 5
preview_words_before: 6
# Maximum amount of words to display after a matched word in the preview # Default: 10
preview_words_after: 10
# Set the search token separator # Default: /[\s\-/]+/ # Example: enable support for hyphenated search words
tokenizer_separator: /[\s/]+/
# Display the relative url in search results # Supports true (default) or false
rel_url: false
# Enable or disable the search button that appears in the bottom right corner of every page # Supports true or false (default)
button: true

mermaid:
# See https://cdn.jsdelivr.net/npm/mermaid/
version: "10.6.1"
3 changes: 3 additions & 0 deletions docs/_includes/mermaid_config.js
@@ -0,0 +1,3 @@
{
"theme": "forest"
}
2 changes: 1 addition & 1 deletion docs/concepts.md
@@ -1,5 +1,5 @@
---
nav_order: 8
nav_order: 18
has_children: true
title: Concepts
permalink: /concepts
2 changes: 1 addition & 1 deletion docs/concepts/cosine-similarity.md
@@ -8,7 +8,7 @@ layout: default
# Cosine Similarity

Cosine similarity is a measure of the degree of similarity between two vectors in
a multi-dimensional space. It is commonly used in artificial intelligence and natural
a multidimensional space. It is commonly used in artificial intelligence and natural
language processing to compare [embeddings](/concepts/embedding),
which are numerical representations of
words or other objects.
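
For a concrete sense of the measure: it is the dot product of the two vectors divided by the product of their magnitudes, giving a value close to 1 for vectors pointing in similar directions. A minimal sketch in Python with NumPy (illustrative only, not code from this repository):

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # cos(theta) = (a . b) / (|a| * |b|)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Two toy "embedding" vectors pointing in roughly the same direction
a = np.array([0.9, 0.1, 0.3])
b = np.array([0.8, 0.2, 0.25])
print(cosine_similarity(a, b))  # ~0.99, i.e. very similar
```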
51 changes: 32 additions & 19 deletions docs/concepts/embedding.md
@@ -11,27 +11,40 @@ Although in use for quite some time, embeddings have become more popular since t
launch of GPT-3 thanks to OpenAI LLMs' ability to accurately capture the "similarity"
of sentences, even across different languages.

For instance, using embeddings and cosine similarity, LLMs can infer that
For instance, using embeddings and [cosine similarity](cosine-similarity), LLMs can infer that
"*the sun is a star*", "*the moon is a satellite*" and "*la Terre est un corps céleste*"
are three sentences with a lot in common, they are "close", while a sentence like
"*fire is a form of energy*" is less similar instead, less close.

One particular note to highlight, embedding are not compatible across LLMs, and their
ability to detect similarity varies a lot from model to model. We recommend to start
with OpenAI Ada2, though there are other models that might be less expensive for
your scenario, or even perform better. The choice depends mostly on the type of
documents and the number of languages used.
"*fire is a form of energy*" is less similar instead, less _close_.

Embeddings are a powerful tool for software developers working with artificial intelligence
and natural language processing. They allow computers to understand the meaning of
words in a more sophisticated way, by representing them as high-dimensional vectors
rather than simple strings of characters. Embeddings come in the form of numerical
vectors, e.g. a list of hundreds of floating numbers, that KM stores in vector storages.

Embeddings work by mapping each word in a vocabulary to a point in a high-dimensional
space. This space is designed so that words with similar meanings are located near
each other. This allows algorithms to identify relationships between words, such
as synonyms or antonyms, without needing explicit rules or human supervision.
rather than simple strings of characters. Embeddings come in the form of **numerical
vectors**, e.g. a list of **hundreds of floating-point numbers**, that KM stores in vector DBs
such as Azure AI Search, Qdrant, PostgreSQL, etc.

Embeddings work by splitting each word into smaller "tokens", and mapping each known token
in a vocabulary to a point in a **high-dimensional space**.
This space and the token vocabulary are designed so that **words with similar meanings are
located near each other**.
This allows algorithms to identify relationships between words, such as synonyms or antonyms,
without needing explicit rules or human supervision.
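
To make the "similar meanings are located near each other" point concrete, here is a toy sketch with hand-picked 3-dimensional vectors (hypothetical values chosen for illustration; real embeddings have hundreds or thousands of dimensions and come from a model, not a hand-written table):

```python
import numpy as np

# Hypothetical toy vectors; a real model would produce these from text.
vocab = {
    "sun":  np.array([0.90, 0.80, 0.10]),
    "star": np.array([0.85, 0.75, 0.20]),
    "fire": np.array([0.10, 0.20, 0.95]),
}

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine(vocab["sun"], vocab["star"]))  # ~0.99: related concepts sit close together
print(cosine(vocab["sun"], vocab["fire"]))  # ~0.29: unrelated concepts are further apart
```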

One important aspect to consider is that for a given sentence, each LLM produces different
embeddings, which are not compatible across AI models. So for instance, embeddings generated
by OpenAI Ada cannot be compared with those generated by Mistral, Cohere, etc.
Also, LLMs' ability to extract meaning and detect similarity varies a lot from model to model,
and **some embeddings should not be used for text comparison**, to avoid incoherent results.

Consider looking at [Hugging Face Massive Text Embedding Benchmark (MTEB)](https://huggingface.co/spaces/mteb/leaderboard)
to discover a list of suitable models, e.g. the Clustering table shows how well models
can group similar sentences and paragraphs. We recommend starting with **OpenAI Ada2** ("text-embedding-ada-002"),
currently at position #10 on MTEB, because it's easy to set up and works well across multiple
languages.

> ![img.png](img.png)
>
> [_MTEB leaderboard, Clustering, January 2024._]
One popular method for creating embeddings is
Word2Vec [[1]](https://arxiv.org/abs/1301.3781)[[2]](https://arxiv.org/abs/1310.4546),
@@ -46,7 +59,7 @@ document classification, and recommendation systems. They are particularly usefu
when working with unstructured text data where traditional methods like bag-of-words
models struggle, and are a fundamental part of Kernel Memory.

Kernel Memory is similar to how the human brain stores and retrieves knowledge about
Kernel Memory attempts to emulate how the human brain stores and retrieves knowledge about
the world. Embeddings are used to create a semantic map by **representing concepts
or entities as vectors in a high-dimensional space**. This approach allows KM
to learn relationships between concepts and make inferences based on similarity or
@@ -90,6 +103,6 @@ Some examples about embeddings applications.

## Vector Operations used with Embeddings

- [Cosine Similarity](/concepts/cosine-similarity)
- [Dot Product](/concepts/dot-product)
- [Euclidean Distance](/concepts/euclidean-distance)
- [Cosine Similarity](cosine-similarity)
- [Dot Product](dot-product)
- [Euclidean Distance](euclidean-distance)
Binary file added docs/concepts/img.png
Binary file added docs/csharp.png
2 changes: 1 addition & 1 deletion docs/extensions.md
@@ -1,5 +1,5 @@
---
nav_order: 4
nav_order: 14
has_children: true
title: Extensions
permalink: /extensions
2 changes: 1 addition & 1 deletion docs/extensions/memory-db/dev-tools.md
@@ -1,5 +1,5 @@
---
nav_order: 4
nav_order: 100
grand_parent: Extensions
parent: Memory DBs
title: Simple memory
12 changes: 12 additions & 0 deletions docs/extensions/memory-db/elastic-search.md
@@ -0,0 +1,12 @@
---
nav_order: 4
grand_parent: Extensions
parent: Memory DBs
title: Elastic Search
permalink: /extensions/memory-db/elastic-search
layout: default
---
# Elastic Search

{: .highlight }
documentation under development
