diff --git a/server/src/schema/migrations/1764483051488-OCRBigramsForCJK.ts b/server/src/schema/migrations/1764483051488-OCRBigramsForCJK.ts index 7b659396fe..3f5cb5fa8f 100644 --- a/server/src/schema/migrations/1764483051488-OCRBigramsForCJK.ts +++ b/server/src/schema/migrations/1764483051488-OCRBigramsForCJK.ts @@ -3,21 +3,28 @@ import { tokenizeForSearch } from 'src/utils/database'; export async function up(db: Kysely): Promise { await sql`truncate ${sql.table('ocr_search')}`.execute(db); - const batch = []; - for await (const { assetId, text } of db - .selectFrom('asset_ocr') - .select(['assetId', sql`string_agg(text, ' ')`.as('text')]) - .groupBy('assetId') - .stream()) { - batch.push({ assetId, text: tokenizeForSearch(text) }); - if (batch.length >= 5000) { - await db.insertInto('ocr_search').values(batch).execute(); - batch.length = 0; - } - } - if (batch.length > 0) { - await db.insertInto('ocr_search').values(batch).execute(); + let lastAssetId: string | undefined; + while (true) { + const rows = await db + .selectFrom('asset_ocr') + .select(['assetId', sql`string_agg(text, ' ')`.as('text')]) + .$if(lastAssetId !== undefined, (qb) => qb.where('assetId', '>', lastAssetId)) + .groupBy('assetId') + .orderBy('assetId') + .limit(5000) + .execute(); + + if (rows.length === 0) { + break; + } + + await db + .insertInto('ocr_search') + .values(rows.map(({ assetId, text }) => ({ assetId, text: tokenizeForSearch(text).join(' ') }))) + .execute(); + + lastAssetId = rows.at(-1)!.assetId; } }