diff --git a/packages/common/infra/src/sync/indexer/impl/indexeddb/inverted-index.ts b/packages/common/infra/src/sync/indexer/impl/indexeddb/inverted-index.ts
index 76f85d4d6f..f21e1b793b 100644
--- a/packages/common/infra/src/sync/indexer/impl/indexeddb/inverted-index.ts
+++ b/packages/common/infra/src/sync/indexer/impl/indexeddb/inverted-index.ts
@@ -244,9 +244,12 @@ export class FullTextInvertedIndex implements InvertedIndex {
       // normalize score
       const maxScore = submatched.reduce((acc, s) => Math.max(acc, s.score), 0);
-      const minScore = submatched.reduce((acc, s) => Math.min(acc, s.score), 1);
+      const minScore = submatched.reduce((acc, s) => Math.min(acc, s.score), 0);
       for (const { nid, score, position } of submatched) {
-        const normalizedScore = (score - minScore) / (maxScore - minScore);
+        const normalizedScore =
+          maxScore === minScore
+            ? score
+            : (score - minScore) / (maxScore - minScore);
         const match = matched.get(nid) || {
           score: [] as number[],
           positions: new Map(),
         };
diff --git a/packages/frontend/core/package.json b/packages/frontend/core/package.json
index 07aa356602..81332c7457 100644
--- a/packages/frontend/core/package.json
+++ b/packages/frontend/core/package.json
@@ -72,6 +72,7 @@
     "jotai-devtools": "^0.10.0",
     "jotai-effect": "^1.0.0",
     "jotai-scope": "^0.7.0",
+    "lib0": "^0.2.95",
     "lit": "^3.1.3",
     "lodash-es": "^4.17.21",
     "lottie-react": "^2.4.0",
diff --git a/packages/frontend/core/src/modules/docs-search/entities/docs-indexer.ts b/packages/frontend/core/src/modules/docs-search/entities/docs-indexer.ts
index ad849f1f16..ccfa246aea 100644
--- a/packages/frontend/core/src/modules/docs-search/entities/docs-indexer.ts
+++ b/packages/frontend/core/src/modules/docs-search/entities/docs-indexer.ts
@@ -23,11 +23,9 @@ export function isEmptyUpdate(binary: Uint8Array) {
 const logger = new DebugLogger('crawler');
 
 interface IndexerJobPayload {
-  docId: string;
   storageDocId: string;
 }
 
-// TODO(@eyhn): simplify this, it's too complex
 export class DocsIndexer extends Entity {
   private readonly jobQueue: JobQueue<IndexerJobPayload> =
     new IndexedDBJobQueue<IndexerJobPayload>(
@@ -72,13 +70,11 @@ export class DocsIndexer extends Entity {
       return;
     }
     if (event.clientId === this.workspaceEngine.doc.clientId) {
-      const docId = normalizeDocId(event.docId);
-
       this.jobQueue
         .enqueue([
           {
-            batchKey: docId,
-            payload: { docId, storageDocId: event.docId },
+            batchKey: event.docId,
+            payload: { storageDocId: event.docId },
           },
         ])
         .catch(err => {
@@ -93,18 +89,17 @@ export class DocsIndexer extends Entity {
       return;
     }
 
-    // jobs should have the same docId, so we just pick the first one
-    const docId = jobs[0].payload.docId;
+    // jobs should have the same storage docId, so we just pick the first one
     const storageDocId = jobs[0].payload.storageDocId;
 
     const worker = await this.ensureWorker(signal);
 
     const startTime = performance.now();
-    logger.debug('Start crawling job for docId:', docId);
+    logger.debug('Start crawling job for storageDocId:', storageDocId);
 
     let workerOutput;
 
-    if (docId === this.workspaceId) {
+    if (storageDocId === this.workspaceId) {
       const rootDocBuffer =
         await this.workspaceEngine.doc.storage.loadDocFromLocal(
           this.workspaceId
@@ -151,7 +146,7 @@ export class DocsIndexer extends Entity {
       workerOutput = await worker.run({
         type: 'doc',
         docBuffer,
-        docId,
+        storageDocId,
         rootDocBuffer,
       });
     }
@@ -190,13 +185,13 @@ export class DocsIndexer extends Entity {
     }
     await docIndexWriter.commit();
     const blockIndexWriter = await this.blockIndex.write();
-    for (const { blocks } of workerOutput.addedDoc) {
+    for (const { id, blocks } of workerOutput.addedDoc) {
       // delete old blocks
       const oldBlocks = await blockIndexWriter.search(
         {
           type: 'match',
           field: 'docId',
-          match: docId,
+          match: id,
         },
         {
           pagination: {
@@ -217,16 +212,20 @@ export class DocsIndexer extends Entity {
 
     if (workerOutput.reindexDoc) {
       await this.jobQueue.enqueue(
-        workerOutput.reindexDoc.map(({ docId, storageDocId }) => ({
-          batchKey: docId,
-          payload: { docId, storageDocId },
+        workerOutput.reindexDoc.map(({ storageDocId }) => ({
+          batchKey: storageDocId,
+          payload: { storageDocId },
         }))
       );
     }
 
     const duration = performance.now() - startTime;
     logger.debug(
-      'Finish crawling job for docId:' + docId + ' in ' + duration + 'ms '
+      'Finish crawling job for storageDocId:' +
+        storageDocId +
+        ' in ' +
+        duration +
+        'ms '
     );
   }
 
@@ -236,7 +235,7 @@ export class DocsIndexer extends Entity {
       .enqueue([
         {
           batchKey: this.workspaceId,
-          payload: { docId: this.workspaceId, storageDocId: this.workspaceId },
+          payload: { storageDocId: this.workspaceId },
         },
       ])
       .catch(err => {
@@ -255,47 +254,3 @@ export class DocsIndexer extends Entity {
     this.runner.stop();
   }
 }
-
-function normalizeDocId(raw: string) {
-  enum DocVariant {
-    Workspace = 'workspace',
-    Page = 'page',
-    Space = 'space',
-    Settings = 'settings',
-    Unknown = 'unknown',
-  }
-
-  try {
-    if (!raw.length) {
-      throw new Error('Invalid Empty Doc ID');
-    }
-
-    let parts = raw.split(':');
-
-    if (parts.length > 3) {
-      // special adapt case `wsId:space:page:pageId`
-      if (parts[1] === DocVariant.Space && parts[2] === DocVariant.Page) {
-        parts = [parts[0], DocVariant.Space, parts[3]];
-      } else {
-        throw new Error(`Invalid format of Doc ID: ${raw}`);
-      }
-    } else if (parts.length === 2) {
-      // `${variant}:${guid}`
-      throw new Error('not supported');
-    } else if (parts.length === 1) {
-      // ${ws} or ${pageId}
-      parts = ['', DocVariant.Unknown, parts[0]];
-    }
-
-    const docId = parts.at(2);
-
-    if (!docId) {
-      throw new Error('ID is required');
-    }
-
-    return docId;
-  } catch (err) {
-    logger.error('Error on normalize docId ' + raw, err);
-    return raw;
-  }
-}
diff --git a/packages/frontend/core/src/modules/docs-search/worker/in-worker.ts b/packages/frontend/core/src/modules/docs-search/worker/in-worker.ts
index cdb5c1ad60..1fb6c5a0ad 100644
--- a/packages/frontend/core/src/modules/docs-search/worker/in-worker.ts
+++ b/packages/frontend/core/src/modules/docs-search/worker/in-worker.ts
@@ -1,6 +1,8 @@
 import type { AffineTextAttributes } from '@blocksuite/blocks';
 import type { DeltaInsert } from '@blocksuite/inline';
 import { Document } from '@toeverything/infra';
+import { toHexString } from 'lib0/buffer.js';
+import { digest as lib0Digest } from 'lib0/hash/sha256';
 import { difference } from 'lodash-es';
 import {
   applyUpdate,
@@ -18,24 +20,66 @@ import type {
   WorkerOutput,
 } from './types';
 
-function crawlingDocData({
+let cachedRootDoc: { doc: YDoc; hash: string } | null = null;
+
+async function digest(data: Uint8Array) {
+  if (
+    globalThis.crypto &&
+    globalThis.crypto.subtle &&
+    typeof globalThis.crypto.subtle.digest === 'function'
+  ) {
+    return new Uint8Array(
+      await globalThis.crypto.subtle.digest('SHA-256', data)
+    );
+  }
+  return lib0Digest(data);
+}
+
+async function crawlingDocData({
   docBuffer,
-  docId,
+  storageDocId,
   rootDocBuffer,
-}: WorkerInput & { type: 'doc' }): WorkerOutput {
-  const yRootDoc = new YDoc();
-  applyUpdate(yRootDoc, rootDocBuffer);
+}: WorkerInput & { type: 'doc' }): Promise<WorkerOutput> {
+  if (isEmptyUpdate(rootDocBuffer)) {
+    console.warn('[worker]: Empty root doc buffer');
+    return {};
+  }
+
+  const rootDocBufferHash = toHexString(await digest(rootDocBuffer));
+
+  let yRootDoc;
+  if (cachedRootDoc && cachedRootDoc.hash === rootDocBufferHash) {
+    yRootDoc = cachedRootDoc.doc;
+  } else {
+    yRootDoc = new YDoc();
+    applyUpdate(yRootDoc, rootDocBuffer);
+    cachedRootDoc = { doc: yRootDoc, hash: rootDocBufferHash };
+  }
+
+  let docId = null;
+  for (const [id, subdoc] of yRootDoc.getMap('spaces')) {
+    if (subdoc instanceof YDoc && storageDocId === subdoc.guid) {
+      docId = id;
+      break;
+    }
+  }
+
+  if (docId === null) {
+    return {};
+  }
 
   const ydoc = new YDoc();
-  applyUpdate(ydoc, docBuffer);
+  if (!isEmptyUpdate(docBuffer)) {
+    applyUpdate(ydoc, docBuffer);
+  }
 
   let docExists: boolean | null = null;
 
   (
     yRootDoc.getMap('meta').get('pages') as YArray<YMap<any>> | undefined
   )?.forEach(page => {
-    if (page.get('id') === docId) {
+    if (page.get('id') === storageDocId) {
       docExists = !(page.get('trash') ?? false);
     }
   });
 
@@ -283,7 +327,7 @@ function crawlingRootDocData({
   };
 }
 
-globalThis.onmessage = (event: MessageEvent<WorkerIngoingMessage>) => {
+globalThis.onmessage = async (event: MessageEvent<WorkerIngoingMessage>) => {
   const message = event.data;
   if (message.type === 'init') {
     postMessage({ type: 'init', msgId: message.msgId });
@@ -296,7 +340,7 @@ globalThis.onmessage = (event: MessageEvent<WorkerIngoingMessage>) => {
       if (input.type === 'rootDoc') {
         data = crawlingRootDocData(input);
       } else {
-        data = crawlingDocData(input);
+        data = await crawlingDocData(input);
       }
 
       postMessage({ type: 'done', msgId: message.msgId, output: data });
@@ -311,3 +355,10 @@ globalThis.onmessage = (event: MessageEvent<WorkerIngoingMessage>) => {
 };
 
 declare function postMessage(message: WorkerOutgoingMessage): void;
+
+function isEmptyUpdate(binary: Uint8Array) {
+  return (
+    binary.byteLength === 0 ||
+    (binary.byteLength === 2 && binary[0] === 0 && binary[1] === 0)
+  );
+}
diff --git a/packages/frontend/core/src/modules/docs-search/worker/types.ts b/packages/frontend/core/src/modules/docs-search/worker/types.ts
index d5ff7f10dd..9240ac63cc 100644
--- a/packages/frontend/core/src/modules/docs-search/worker/types.ts
+++ b/packages/frontend/core/src/modules/docs-search/worker/types.ts
@@ -34,7 +34,7 @@ export type WorkerInput =
     }
   | {
       type: 'doc';
-      docId: string;
+      storageDocId: string;
       rootDocBuffer: Uint8Array;
       docBuffer: Uint8Array;
     };
diff --git a/yarn.lock b/yarn.lock
index 6a567ca5ee..ab6877ae2d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -452,6 +452,7 @@ __metadata:
     jotai-devtools: "npm:^0.10.0"
     jotai-effect: "npm:^1.0.0"
     jotai-scope: "npm:^0.7.0"
+    lib0: "npm:^0.2.95"
    lit: "npm:^3.1.3"
     lodash-es: "npm:^4.17.21"
     lottie-react: "npm:^2.4.0"