Skip to content

Commit

Permalink
feat(kb): Generating embedding chunks and vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
RezaRahemtola committed Aug 6, 2024
1 parent dd3de86 commit bb05f2a
Show file tree
Hide file tree
Showing 8 changed files with 1,045 additions and 582 deletions.
1,529 changes: 961 additions & 568 deletions package-lock.json

Large diffs are not rendered by default.

21 changes: 11 additions & 10 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,42 +25,43 @@
"@aleph-sdk/message": "^1.0.7",
"@libertai/libertai-js": "0.0.9",
"@quasar/extras": "^1.16.12",
"@tanstack/vue-query": "^5.51.17",
"@tanstack/vue-query": "^5.51.21",
"@wagmi/vue": "^0.0.34",
"axios": "^1.7.3",
"dayjs": "^1.11.12",
"dompurify": "^3.1.6",
"eciesjs": "^0.4.7",
"filesize": "^10.1.4",
"highlight.js": "^11.10.0",
"langchain": "^0.2.12",
"localforage": "^1.10.0",
"marked": "^12.0.2",
"marked": "^13.0.3",
"marked-highlight": "^2.1.3",
"mime": "^4.0.4",
"pdfjs-dist": "^4.5.136",
"pinia": "^2.2.0",
"pinia": "^2.2.1",
"pinia-plugin-persistedstate": "^3.2.1",
"quasar": "^2.16.6",
"quasar": "^2.16.7",
"stream": "^0.0.3",
"uuid": "^10.0.0",
"viem": "^2.18.7",
"vue": "^3.4.35",
"viem": "^2.18.8",
"vue": "^3.4.36",
"vue-router": "^4.4.2",
"web3": "^4.11.1",
"zod": "^3.23.8"
},
"devDependencies": {
"@quasar/app-vite": "^1.9.3",
"@quasar/app-vite": "^1.9.4",
"@types/dompurify": "^3.0.5",
"@types/node": "^20.14.13",
"@typescript-eslint/eslint-plugin": "^7.18.0",
"@typescript-eslint/parser": "^7.18.0",
"@typescript-eslint/eslint-plugin": "^8.0.1",
"@typescript-eslint/parser": "^8.0.1",
"@vue/eslint-config-typescript": "^13.0.0",
"autoprefixer": "^10.4.20",
"eslint": "^8.57.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-vue": "^9.27.0",
"postcss": "^8.4.40",
"postcss": "^8.4.41",
"prettier": "^3.3.3",
"tailwindcss": "^3.4.7",
"typescript": "^5.5.4",
Expand Down
1 change: 0 additions & 1 deletion src/components/MarkdownRenderer.vue
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ const marked = new Marked(
const renderedContent = ref('');
/**
 * Renders Markdown `content` to HTML and publishes the sanitized result
 * through `renderedContent`.
 */
async function updateContent(content: string) {
  const html = await marked.parse(content);
  // Sanitization is applied to the generated HTML, not to the raw Markdown input.
  const safeHtml = DOMPurify.sanitize(html);
  renderedContent.value = safeHtml;
}
Expand Down
2 changes: 1 addition & 1 deletion src/stores/old-knowledge.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { defineStore } from 'pinia';
import { Document, KnowledgeStore } from '@libertai/libertai-js';
import { defaultKnowledge } from '../utils/knowledge';
import { defaultKnowledge } from '../utils/knowledge/default';
import { v4 as uuidv4 } from 'uuid';

// Tag value exported for other modules.
// NOTE(review): presumably marks the built-in default knowledge documents — confirm against usages.
export const DEFAULT_KNOWLEDGE_TAG = 'default';
Expand Down
9 changes: 8 additions & 1 deletion src/types/knowledge.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import { z } from 'zod';

/** Schema for one chunk of a document: a piece of text plus its numeric vector. */
const knowledgeDocumentChunk = z.object({
  // Text of the chunk.
  content: z.string(),
  // Numeric vector associated with the chunk text (filled from the embedding API by `generateChunks`).
  vector: z.array(z.number()),
});

/** Static type inferred from the chunk schema. */
export type KnowledgeDocumentChunk = z.infer<typeof knowledgeDocumentChunk>;

const knowledgeDocumentSchema = z.object({
id: z.string().uuid(),
name: z.string(),
type: z.string(),
content: z.string(),
chunks: z.array(knowledgeDocumentChunk),
size: z.number(),
store: z.object({
item_hash: z.string(),
Expand Down
File renamed without changes.
5 changes: 4 additions & 1 deletion src/utils/knowledge/document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ import { v4 as uuidv4 } from 'uuid';

import { KnowledgeDocument } from 'src/types/knowledge';
import { extractFileContent } from 'src/utils/knowledge/parsing';
import { generateChunks } from 'src/utils/knowledge/embedding';

/**
 * Builds a knowledge document (everything except its `store` information)
 * from an uploaded file.
 *
 * The file's content is extracted first, then split into embedded chunks with
 * the file name used as the chunk title.
 *
 * @param file - File selected by the user.
 * @returns The extracted file info plus a fresh UUID, the file's name and
 *   size, and the generated chunks.
 */
export const processDocument = async (file: File): Promise<Omit<KnowledgeDocument, 'store'>> => {
  const fileInfo = await extractFileContent(file);
  // Chunks must be generated before returning: the document schema requires `chunks`.
  const chunks = await generateChunks(file.name, fileInfo.content);

  return { ...fileInfo, id: uuidv4(), name: file.name, size: file.size, chunks };
};
60 changes: 60 additions & 0 deletions src/utils/knowledge/embedding.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import axios from 'axios';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { KnowledgeDocumentChunk } from 'src/types/knowledge';

// Endpoint that `embed` POSTs chunk text to in order to obtain its embedding vector.
const DEFAULT_EMBEDDING_API_URL =
'https://curated.aleph.cloud/vm/ee1b2a8e5bd645447739d8b234ef495c9a2b4d0b98317d510a3ccf822808ebe5/embedding';

/**
 * Splits a document's text into chunks and attaches an embedding vector to
 * each of them.
 *
 * The text is split with a `RecursiveCharacterTextSplitter`, which is given a
 * chunk header built from the document title. One `embed` call is started per
 * resulting chunk and all of them are awaited together.
 *
 * @param title - Document title inserted into the chunk header.
 * @param content - Full text of the document.
 * @param chunkSize - `chunkSize` handed to the splitter (default 500).
 * @param overlapSize - `chunkOverlap` handed to the splitter (default 100).
 * @returns One `{ content, vector }` entry per chunk, in splitter order.
 */
export const generateChunks = async (
  title: string,
  content: string,
  chunkSize = 500,
  overlapSize = 100,
): Promise<KnowledgeDocumentChunk[]> => {
  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize,
    chunkOverlap: overlapSize,
    separators: ['\n\n---\n\n', '\n\n', '\n', ' '],
  });

  const headerOptions = {
    chunkHeader: `DOCUMENT TITLE: ${title}\n\n---\n\n`,
    appendChunkOverlapHeader: true,
  };

  // LangChain documents for the single input text.
  // TODO: include metadata (currently an empty list)
  const pieces = await textSplitter.createDocuments([content], [], headerOptions);

  // Start every embedding request up front, then wait for all of them.
  const pending = pieces.map(async ({ pageContent }): Promise<KnowledgeDocumentChunk> => {
    const vector = await embed(pageContent);
    return { content: pageContent, vector };
  });

  const chunks = await Promise.all(pending);
  return chunks;
};

/**
 * Requests an embedding vector for `content` from the embedding API.
 *
 * The request is attempted up to three times. After a failed attempt the
 * function waits before retrying, starting at one second and doubling each
 * time. No wait happens after the last attempt, since nothing follows it.
 *
 * @param content - Text to embed.
 * @returns The `embedding` array from the API response, or an empty array if
 *   every attempt failed (failures are logged to the console, not thrown).
 */
async function embed(content: string): Promise<number[]> {
  const tries = 3;
  let timeout = 1000;

  for (let attempt = 1; attempt <= tries; attempt++) {
    try {
      const response = await axios.post<{ embedding: number[] }>(DEFAULT_EMBEDDING_API_URL, {
        content,
      });

      return response.data.embedding;
    } catch (error) {
      console.error(`Error embedding text: ${error}`);

      // Only back off when another attempt will actually be made.
      if (attempt < tries) {
        await new Promise((resolve) => setTimeout(resolve, timeout));
        timeout *= 2;
      }
    }
  }

  // All attempts failed: keep the best-effort contract and return an empty vector.
  return [];
}

0 comments on commit bb05f2a

Please sign in to comment.