Skip to content

Commit

Permalink
full text tmp code (#3561)
Browse files (browse the repository at this point in the history)
* full text tmp code

* fix: init

* fix: init

* remove tmp code

* remove tmp code

* 4818-alpha
  • Loading branch information
c121914yu authored Jan 10, 2025
1 parent fadb3e3 commit 47f7b1a
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 46 deletions.
7 changes: 1 addition & 6 deletions packages/service/core/dataset/data/dataTextSchema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,7 @@ const DatasetDataTextSchema = new Schema({
});

try {
DatasetDataTextSchema.index(
{ teamId: 1, datasetId: 1, fullTextToken: 'text' },
{
partialFilterExpression: { fullTextToken: { $exists: true } }
}
);
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
} catch (error) {
console.log(error);
Expand Down
9 changes: 3 additions & 6 deletions packages/service/core/dataset/data/schema.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
Expand Down Expand Up @@ -70,10 +70,7 @@ const DatasetDataSchema = new Schema({
rebuilding: Boolean,

// Abandon
fullTextToken: {
type: String,
default: ''
},
fullTextToken: String,
initFullText: Boolean
});

Expand All @@ -87,7 +84,7 @@ try {
updateTime: -1
});
// FullText tmp full text index
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 });
Expand Down
74 changes: 42 additions & 32 deletions projects/app/src/pages/api/admin/initv4818.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { NextAPI } from '@/service/middleware/entry';
import { delay } from '@fastgpt/global/common/system/utils';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
Expand All @@ -23,16 +24,36 @@ async function handler(req: NextApiRequest, res: NextApiResponse) {

const start = Date.now();
await initData(batchSize);
// await restore();
console.log('Init data time:', Date.now() - start);

success = 0;
// await batchUpdateFields();
batchUpdateFields();

return { success: true };
}

export default NextAPI(handler);

/**
 * Back-fills `fullTextToken` on MongoDatasetData documents that lack the field.
 *
 * Each pass loads one document whose `fullTextToken` does not exist, sets it to
 * the `jiebaSplit` result of the trimmed `q` + "\n" + `a` text, saves it, bumps the
 * module-level `success` counter and logs progress. The function resolves once
 * no matching document is left.
 *
 * Any error is logged and the pass is retried after `delay(500)`; retries are
 * unbounded, so a permanently failing document keeps the loop alive.
 *
 * Implemented as a loop rather than self-recursion so that processing a large
 * collection does not accumulate one pending async call per document.
 */
const restore = async () => {
  for (;;) {
    try {
      const data = await MongoDatasetData.findOne({ fullTextToken: { $exists: false } });
      // Nothing left to back-fill: done.
      if (!data) return;

      data.fullTextToken = jiebaSplit({ text: `${data.q}\n${data.a}`.trim() });
      await data.save();

      success++;
      console.log('Success:', success);
    } catch (error) {
      console.log(error);
      // Back off briefly before retrying (presumably 500 ms — confirm against `delay`).
      await delay(500);
    }
  }
};

const initData = async (batchSize: number) => {
try {
// Find data that has not been initialized yet
Expand All @@ -59,46 +80,35 @@ const initData = async (batchSize: number) => {
})),
{ ordered: false, session, lean: true }
);

// FullText tmp: mark the source data (matched by dataId) of the successfully inserted rows as initialized
// await MongoDatasetData.updateMany(
// { _id: { $in: result.map((item) => item.dataId) } },
// { $set: { initFullText: true }, $unset: { fullTextToken: 1 } },
// { session }
// );
await MongoDatasetData.updateMany(
{ _id: { $in: result.map((item) => item.dataId) } },
{ $set: { initFullText: true } },
{ session }
);

success += result.length;

console.log('Success:', success);
});

await initData(batchSize);
} catch (error) {
console.log(error, '---');
} catch (error: any) {
console.log(error, '===');
await delay(500);
await initData(batchSize);
}
};

// const batchUpdateFields = async (batchSize = 2000) => {
// // Find documents that still have these fields
// const documents = await MongoDatasetData.find({ initFullText: { $exists: true } }, '_id')
// .limit(batchSize)
// .lean();

// if (documents.length === 0) return;

// // Update in batches
// await MongoDatasetData.updateMany(
// { _id: { $in: documents.map((doc) => doc._id) } },
// {
// $unset: {
// initFullText: 1
// // fullTextToken: 1
// }
// }
// );

// success += documents.length;
// console.log('Delete success:', success);
// await batchUpdateFields(batchSize);
// };
/**
 * Strips the temporary `initFullText` and `fullTextToken` fields from every
 * MongoDatasetData document on which `initFullText` exists.
 *
 * NOTE(review): despite the name, this issues a single `updateMany` over all
 * matching documents; `batchSize` is not read by the body.
 *
 * @param batchSize - Currently unused.
 */
const batchUpdateFields = async (batchSize = 2000) => {
  // Documents still carrying the migration marker.
  const pendingFilter = { initFullText: { $exists: true } };
  // Temporary fields to remove from those documents.
  const fieldsToDrop = { initFullText: 1, fullTextToken: 1 };

  await MongoDatasetData.updateMany(pendingFilter, { $unset: fieldsToDrop });
};
4 changes: 2 additions & 2 deletions projects/app/src/service/core/dataset/data/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ export async function insertData2Dataset({
q,
a,
// FullText tmp
fullTextToken: jiebaSplit({ text: qaStr }),
// fullTextToken: jiebaSplit({ text: qaStr }),
chunkIndex,
indexes: indexes?.map((item, i) => ({
...item,
Expand Down Expand Up @@ -243,7 +243,7 @@ export async function updateData2Dataset({
mongoData.q = q || mongoData.q;
mongoData.a = a ?? mongoData.a;
// FullText tmp
mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// @ts-ignore
mongoData.indexes = newIndexes;
await mongoData.save({ session });
Expand Down

0 comments on commit 47f7b1a

Please sign in to comment.