From 47f7b1a7a3c7918bdc27f855452b8fef7398549e Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Fri, 10 Jan 2025 18:03:14 +0800 Subject: [PATCH] full text tmp code (#3561) * full text tmp code * fix: init * fix: init * remove tmp code * remove tmp code * 4818-alpha --- .../core/dataset/data/dataTextSchema.ts | 7 +- packages/service/core/dataset/data/schema.ts | 9 +-- projects/app/src/pages/api/admin/initv4818.ts | 74 +++++++++++-------- .../service/core/dataset/data/controller.ts | 4 +- 4 files changed, 48 insertions(+), 46 deletions(-) diff --git a/packages/service/core/dataset/data/dataTextSchema.ts b/packages/service/core/dataset/data/dataTextSchema.ts index a3c57c2b2d4..e0564ed2b61 100644 --- a/packages/service/core/dataset/data/dataTextSchema.ts +++ b/packages/service/core/dataset/data/dataTextSchema.ts @@ -33,12 +33,7 @@ const DatasetDataTextSchema = new Schema({ }); try { - DatasetDataTextSchema.index( - { teamId: 1, datasetId: 1, fullTextToken: 'text' }, - { - partialFilterExpression: { fullTextToken: { $exists: true } } - } - ); + DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }); DatasetDataTextSchema.index({ dataId: 1 }, { unique: true }); } catch (error) { console.log(error); diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts index 3b7ac881942..85dd8a7d22e 100644 --- a/packages/service/core/dataset/data/schema.ts +++ b/packages/service/core/dataset/data/schema.ts @@ -1,4 +1,4 @@ -import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo'; +import { connectionMongo, getMongoModel } from '../../../common/mongo'; const { Schema, model, models } = connectionMongo; import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d'; import { @@ -70,10 +70,7 @@ const DatasetDataSchema = new Schema({ rebuilding: Boolean, // Abandon - fullTextToken: { - type: String, - default: '' - }, + fullTextToken: String, initFullText: Boolean }); @@ -87,7 +84,7 @@ try { updateTime: -1 }); // FullText tmp full text index - DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }); + // DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }); // Recall vectors after data matching DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 }); DatasetDataSchema.index({ updateTime: 1 }); diff --git a/projects/app/src/pages/api/admin/initv4818.ts b/projects/app/src/pages/api/admin/initv4818.ts index 196ecd5899d..12f40b3e771 100644 --- a/projects/app/src/pages/api/admin/initv4818.ts +++ b/projects/app/src/pages/api/admin/initv4818.ts @@ -1,6 +1,7 @@ import { NextAPI } from '@/service/middleware/entry'; import { delay } from '@fastgpt/global/common/system/utils'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; +import { jiebaSplit } from '@fastgpt/service/common/string/jieba'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; import { authCert } from '@fastgpt/service/support/permission/auth/common'; @@ -23,16 +24,36 @@ async function handler(req: NextApiRequest, res: NextApiResponse) { const start = Date.now(); await initData(batchSize); + // await restore(); console.log('Init data time:', Date.now() - start); success = 0; - // await batchUpdateFields(); + batchUpdateFields(); return { success: true }; } export default NextAPI(handler); +const restore = async () => { + try { + const data = await MongoDatasetData.findOne({ fullTextToken: { $exists: false } }); + if (!data) return; + + data.fullTextToken = jiebaSplit({ text: `${data.q}\n${data.a}`.trim() }); + await data.save(); + + success++; + console.log('Success:', success); + + await restore(); + } catch (error) { + console.log(error); + await delay(500); + await restore(); + } +}; + const initData = async (batchSize: number) => { try { // 找到没有初始化的数据 @@ -59,46 +80,35 @@ const initData = async (batchSize: number) => { })), { ordered: false, session, lean: true } ); + // FullText tmp 把成功插入的新数据的 dataId 更新为已初始化 - // await MongoDatasetData.updateMany( - // { _id: { $in: result.map((item) => item.dataId) } }, - // { $set: { initFullText: true }, $unset: { fullTextToken: 1 } }, - // { session } - // ); + await MongoDatasetData.updateMany( + { _id: { $in: result.map((item) => item.dataId) } }, + { $set: { initFullText: true } }, + { session } + ); success += result.length; - console.log('Success:', success); }); await initData(batchSize); - } catch (error) { - console.log(error, '---'); + } catch (error: any) { + console.log(error, '==='); await delay(500); await initData(batchSize); } }; -// const batchUpdateFields = async (batchSize = 2000) => { -// // Find documents that still have these fields -// const documents = await MongoDatasetData.find({ initFullText: { $exists: true } }, '_id') -// .limit(batchSize) -// .lean(); - -// if (documents.length === 0) return; - -// // Update in batches -// await MongoDatasetData.updateMany( -// { _id: { $in: documents.map((doc) => doc._id) } }, -// { -// $unset: { -// initFullText: 1 -// // fullTextToken: 1 -// } -// } -// ); - -// success += documents.length; -// console.log('Delete success:', success); -// await batchUpdateFields(batchSize); -// }; +const batchUpdateFields = async (batchSize = 2000) => { + // Update in batches + await MongoDatasetData.updateMany( + { initFullText: { $exists: true } }, + { + $unset: { + initFullText: 1, + fullTextToken: 1 + } + } + ); +}; diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index c3a607d77b3..d07753da6b0 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -90,7 +90,7 @@ export async function insertData2Dataset({ q, a, // FullText tmp - fullTextToken: jiebaSplit({ text: qaStr }), + // fullTextToken: jiebaSplit({ text: qaStr }), chunkIndex, indexes: indexes?.map((item, i) => ({ ...item, @@ -243,7 +243,7 @@ export async function updateData2Dataset({ mongoData.q = q || mongoData.q; mongoData.a = a ?? mongoData.a; // FullText tmp - mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() }); + // mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() }); // @ts-ignore mongoData.indexes = newIndexes; await mongoData.save({ session });