fix(infra): better search result (#7774)

EYHN 2024-08-07 09:16:43 +00:00
parent 2f0e39b702
commit d968cfe425
GPG Key ID: 46C9E26A75AB276C
6 changed files with 85 additions and 74 deletions

View File

@@ -244,9 +244,12 @@ export class FullTextInvertedIndex implements InvertedIndex {
// normalize score
const maxScore = submatched.reduce((acc, s) => Math.max(acc, s.score), 0);
const minScore = submatched.reduce((acc, s) => Math.min(acc, s.score), 1);
const minScore = submatched.reduce((acc, s) => Math.min(acc, s.score), 0);
for (const { nid, score, position } of submatched) {
const normalizedScore = (score - minScore) / (maxScore - minScore);
const normalizedScore =
maxScore === minScore
? score
: (score - minScore) / (maxScore - minScore);
const match = matched.get(nid) || {
score: [] as number[],
positions: new Map(),
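
The added ternary avoids computing 0 / 0 (a NaN score) when maxScore equals minScore, e.g. when a query only produces sub-matches with identical scores. A minimal standalone sketch of the same normalization with that guard; the `SubMatch` shape here is illustrative, not the library's actual type:

```ts
interface SubMatch {
  nid: number;
  score: number;
}

// Min-max normalize scores into [0, 1]. When all scores are equal the raw
// score is kept instead of dividing by zero.
function normalizeScores(submatched: SubMatch[]): Map<number, number> {
  const maxScore = submatched.reduce((acc, s) => Math.max(acc, s.score), 0);
  const minScore = submatched.reduce((acc, s) => Math.min(acc, s.score), 0);
  const normalized = new Map<number, number>();
  for (const { nid, score } of submatched) {
    normalized.set(
      nid,
      maxScore === minScore
        ? score
        : (score - minScore) / (maxScore - minScore)
    );
  }
  return normalized;
}
```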

View File

@@ -72,6 +72,7 @@
"jotai-devtools": "^0.10.0",
"jotai-effect": "^1.0.0",
"jotai-scope": "^0.7.0",
"lib0": "^0.2.95",
"lit": "^3.1.3",
"lodash-es": "^4.17.21",
"lottie-react": "^2.4.0",

View File

@@ -23,11 +23,9 @@ export function isEmptyUpdate(binary: Uint8Array) {
const logger = new DebugLogger('crawler');
interface IndexerJobPayload {
docId: string;
storageDocId: string;
}
// TODO(@eyhn): simplify this, it's too complex
export class DocsIndexer extends Entity {
private readonly jobQueue: JobQueue<IndexerJobPayload> =
new IndexedDBJobQueue<IndexerJobPayload>(
@@ -72,13 +70,11 @@ export class DocsIndexer extends Entity {
return;
}
if (event.clientId === this.workspaceEngine.doc.clientId) {
const docId = normalizeDocId(event.docId);
this.jobQueue
.enqueue([
{
batchKey: docId,
payload: { docId, storageDocId: event.docId },
batchKey: event.docId,
payload: { storageDocId: event.docId },
},
])
.catch(err => {
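
With normalizeDocId gone, the queue now batches directly on the storage-level doc id. Jobs that share a batchKey appear to be handed to a single crawl run together, which is why the handler in the next hunk only reads jobs[0].payload. A toy in-memory sketch of that batching contract; it stands in for, and is much simpler than, the real IndexedDBJobQueue:

```ts
interface Job<Payload> {
  batchKey: string;
  payload: Payload;
}

// Toy queue: enqueue() appends jobs, accept() hands back every pending job
// that shares the first pending job's batchKey as one unit of work.
class ToyJobQueue<Payload> {
  private pending: Job<Payload>[] = [];

  async enqueue(jobs: Job<Payload>[]): Promise<void> {
    this.pending.push(...jobs);
  }

  async accept(): Promise<Job<Payload>[] | null> {
    const first = this.pending[0];
    if (!first) return null;
    const batch = this.pending.filter(j => j.batchKey === first.batchKey);
    this.pending = this.pending.filter(j => j.batchKey !== first.batchKey);
    return batch;
  }
}
```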
@@ -93,18 +89,17 @@ export class DocsIndexer extends Entity {
return;
}
// jobs should have the same docId, so we just pick the first one
const docId = jobs[0].payload.docId;
// jobs should have the same storage docId, so we just pick the first one
const storageDocId = jobs[0].payload.storageDocId;
const worker = await this.ensureWorker(signal);
const startTime = performance.now();
logger.debug('Start crawling job for docId:', docId);
logger.debug('Start crawling job for storageDocId:', storageDocId);
let workerOutput;
if (docId === this.workspaceId) {
if (storageDocId === this.workspaceId) {
const rootDocBuffer =
await this.workspaceEngine.doc.storage.loadDocFromLocal(
this.workspaceId
@@ -151,7 +146,7 @@ export class DocsIndexer extends Entity {
workerOutput = await worker.run({
type: 'doc',
docBuffer,
docId,
storageDocId,
rootDocBuffer,
});
}
@@ -190,13 +185,13 @@ export class DocsIndexer extends Entity {
}
await docIndexWriter.commit();
const blockIndexWriter = await this.blockIndex.write();
for (const { blocks } of workerOutput.addedDoc) {
for (const { id, blocks } of workerOutput.addedDoc) {
// delete old blocks
const oldBlocks = await blockIndexWriter.search(
{
type: 'match',
field: 'docId',
match: docId,
match: id,
},
{
pagination: {
@@ -217,16 +212,20 @@ export class DocsIndexer extends Entity {
if (workerOutput.reindexDoc) {
await this.jobQueue.enqueue(
workerOutput.reindexDoc.map(({ docId, storageDocId }) => ({
batchKey: docId,
payload: { docId, storageDocId },
workerOutput.reindexDoc.map(({ storageDocId }) => ({
batchKey: storageDocId,
payload: { storageDocId },
}))
);
}
const duration = performance.now() - startTime;
logger.debug(
'Finish crawling job for docId:' + docId + ' in ' + duration + 'ms '
'Finish crawling job for storageDocId:' +
storageDocId +
' in ' +
duration +
'ms '
);
}
@@ -236,7 +235,7 @@ export class DocsIndexer extends Entity {
.enqueue([
{
batchKey: this.workspaceId,
payload: { docId: this.workspaceId, storageDocId: this.workspaceId },
payload: { storageDocId: this.workspaceId },
},
])
.catch(err => {
@@ -255,47 +254,3 @@ export class DocsIndexer extends Entity {
this.runner.stop();
}
}
function normalizeDocId(raw: string) {
enum DocVariant {
Workspace = 'workspace',
Page = 'page',
Space = 'space',
Settings = 'settings',
Unknown = 'unknown',
}
try {
if (!raw.length) {
throw new Error('Invalid Empty Doc ID');
}
let parts = raw.split(':');
if (parts.length > 3) {
// special adapt case `wsId:space:page:pageId`
if (parts[1] === DocVariant.Space && parts[2] === DocVariant.Page) {
parts = [parts[0], DocVariant.Space, parts[3]];
} else {
throw new Error(`Invalid format of Doc ID: ${raw}`);
}
} else if (parts.length === 2) {
// `${variant}:${guid}`
throw new Error('not supported');
} else if (parts.length === 1) {
// ${ws} or ${pageId}
parts = ['', DocVariant.Unknown, parts[0]];
}
const docId = parts.at(2);
if (!docId) {
throw new Error('ID is required');
}
return docId;
} catch (err) {
logger.error('Error on normalize docId ' + raw, err);
return raw;
}
}
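
For reference, the removed helper's behaviour on a few representative ids (the ids below are hypothetical); the worker now resolves the logical doc id from the root doc's 'spaces' map instead, as shown in the next file:

```ts
// Illustrative calls against the removed helper:
console.log(normalizeDocId('ws1:space:page:page1')); // 'page1'  – composite id unwrapped
console.log(normalizeDocId('page1'));                // 'page1'  – bare id passed through
console.log(normalizeDocId('ws1:settings'));         // logs an error, falls back to the raw id
```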

View File

@@ -1,6 +1,8 @@
import type { AffineTextAttributes } from '@blocksuite/blocks';
import type { DeltaInsert } from '@blocksuite/inline';
import { Document } from '@toeverything/infra';
import { toHexString } from 'lib0/buffer.js';
import { digest as lib0Digest } from 'lib0/hash/sha256';
import { difference } from 'lodash-es';
import {
applyUpdate,
@@ -18,24 +20,66 @@ import type {
WorkerOutput,
} from './types';
function crawlingDocData({
let cachedRootDoc: { doc: YDoc; hash: string } | null = null;
async function digest(data: Uint8Array) {
if (
globalThis.crypto &&
globalThis.crypto.subtle &&
typeof globalThis.crypto.subtle.digest === 'function'
) {
return new Uint8Array(
await globalThis.crypto.subtle.digest('SHA-256', data)
);
}
return lib0Digest(data);
}
async function crawlingDocData({
docBuffer,
docId,
storageDocId,
rootDocBuffer,
}: WorkerInput & { type: 'doc' }): WorkerOutput {
const yRootDoc = new YDoc();
applyUpdate(yRootDoc, rootDocBuffer);
}: WorkerInput & { type: 'doc' }): Promise<WorkerOutput> {
if (isEmptyUpdate(rootDocBuffer)) {
console.warn('[worker]: Empty root doc buffer');
return {};
}
const rootDocBufferHash = toHexString(await digest(rootDocBuffer));
let yRootDoc;
if (cachedRootDoc && cachedRootDoc.hash === rootDocBufferHash) {
yRootDoc = cachedRootDoc.doc;
} else {
yRootDoc = new YDoc();
applyUpdate(yRootDoc, rootDocBuffer);
cachedRootDoc = { doc: yRootDoc, hash: rootDocBufferHash };
}
let docId = null;
for (const [id, subdoc] of yRootDoc.getMap('spaces')) {
if (subdoc instanceof YDoc && storageDocId === subdoc.guid) {
docId = id;
break;
}
}
if (docId === null) {
return {};
}
const ydoc = new YDoc();
applyUpdate(ydoc, docBuffer);
if (!isEmptyUpdate(docBuffer)) {
applyUpdate(ydoc, docBuffer);
}
let docExists: boolean | null = null;
(
yRootDoc.getMap('meta').get('pages') as YArray<YMap<any>> | undefined
)?.forEach(page => {
if (page.get('id') === docId) {
if (page.get('id') === storageDocId) {
docExists = !(page.get('trash') ?? false);
}
});
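
The digest helper above prefers WebCrypto's crypto.subtle.digest where it exists (async, and missing in some worker or non-secure contexts) and falls back to lib0's synchronous pure-JS SHA-256 otherwise; both produce the same bytes. A small sanity sketch of that equivalence, assuming it runs in an ES module with WebCrypto available:

```ts
import { toHexString } from 'lib0/buffer.js';
import { digest as lib0Digest } from 'lib0/hash/sha256';

const input = new TextEncoder().encode('root doc update');

// WebCrypto path (async) and lib0 path (sync) should agree on the digest.
const viaWebCrypto = new Uint8Array(
  await globalThis.crypto.subtle.digest('SHA-256', input)
);
const viaLib0 = lib0Digest(input);
console.log(toHexString(viaWebCrypto) === toHexString(viaLib0)); // true
```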
@@ -283,7 +327,7 @@ function crawlingRootDocData({
};
}
globalThis.onmessage = (event: MessageEvent<WorkerIngoingMessage>) => {
globalThis.onmessage = async (event: MessageEvent<WorkerIngoingMessage>) => {
const message = event.data;
if (message.type === 'init') {
postMessage({ type: 'init', msgId: message.msgId });
@@ -296,7 +340,7 @@ globalThis.onmessage = (event: MessageEvent<WorkerIngoingMessage>) => {
if (input.type === 'rootDoc') {
data = crawlingRootDocData(input);
} else {
data = crawlingDocData(input);
data = await crawlingDocData(input);
}
postMessage({ type: 'done', msgId: message.msgId, output: data });
@@ -311,3 +355,10 @@ globalThis.onmessage = (event: MessageEvent<WorkerIngoingMessage>) => {
};
declare function postMessage(message: WorkerOutgoingMessage): void;
function isEmptyUpdate(binary: Uint8Array) {
return (
binary.byteLength === 0 ||
(binary.byteLength === 2 && binary[0] === 0 && binary[1] === 0)
);
}
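
isEmptyUpdate treats a zero-length buffer and the two-byte [0, 0] encoding as empty; the latter is what Yjs emits for a document with no structs and an empty delete set. A quick check, assuming the standard v1 update encoding:

```ts
import { Doc as YDoc, encodeStateAsUpdate } from 'yjs';

// A freshly created, untouched Yjs document encodes to Uint8Array [0, 0].
const emptyUpdate = encodeStateAsUpdate(new YDoc());
console.log(emptyUpdate);                // Uint8Array(2) [ 0, 0 ]
console.log(isEmptyUpdate(emptyUpdate)); // true
```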

View File

@@ -34,7 +34,7 @@ export type WorkerInput =
}
| {
type: 'doc';
docId: string;
storageDocId: string;
rootDocBuffer: Uint8Array;
docBuffer: Uint8Array;
};

View File

@@ -452,6 +452,7 @@ __metadata:
jotai-devtools: "npm:^0.10.0"
jotai-effect: "npm:^1.0.0"
jotai-scope: "npm:^0.7.0"
lib0: "npm:^0.2.95"
lit: "npm:^3.1.3"
lodash-es: "npm:^4.17.21"
lottie-react: "npm:^2.4.0"