Skip to content
This repository was archived by the owner on Sep 29, 2025. It is now read-only.

Commit f0c4cf2

Browse files
authored
(DOCSP-31304): Implement 'all' command (#40)
* Implement 'all' command * Add tests * Fix slight bug and add more logging * Fix bug betterlike * Address comments
1 parent 39c0f97 commit f0c4cf2

8 files changed

Lines changed: 427 additions & 89 deletions

File tree

chat-core/src/DatabaseConnection.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ export const makeDatabaseConnection = async ({
5656
},
5757

5858
async updateEmbeddedContent({ page, embeddedContent }) {
59+
assert(embeddedContent.length !== 0);
5960
embeddedContent.forEach((embeddedContent) => {
6061
assert(
6162
embeddedContent.sourceName === page.sourceName &&

chat-core/src/Page.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,5 +49,8 @@ export type PageStore = {
4949
sources?: string[];
5050
}): Promise<PersistedPage[]>;
5151

52+
/**
53+
Updates or adds the given pages in the store.
54+
*/
5255
updatePages(pages: PersistedPage[]): Promise<void>;
5356
};

ingest/src/IngestMetaStore.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import { MongoClient } from "mongodb";
2+
3+
/**
  Bookkeeping for ingest runs. It records when a run last succeeded so that a
  later run can resume from that known-good date.

  Choosing the 'since' date passed to the embed command is a trade-off:

  - Too late: pages that were updated during a failed run are never picked up.
  - Too early: more pages and embeddings get checked than necessary. The embed
    command does not recreate embeddings it has already made for a page
    update, but checking potentially every page and embedding is still
    wasteful.
 */
export type IngestMetaStore = {
  /**
    ID of the one metadata document this store reads and writes. Generally an
    ingest_meta collection should hold a single such document per database.
   */
  readonly entryId: string;

  /**
    Resolves to the date of the last successful run recorded for this store's
    entry, or null when there is none.
   */
  loadLastSuccessfulRunDate(): Promise<Date | null>;

  /**
    Records the current date as the last successful run for this store's
    entry.
   */
  updateLastSuccessfulRunDate(): Promise<void>;

  /**
    Closes the underlying connection. Must be called once the store is no
    longer needed.
   */
  close(): Promise<void>;
};
39+
40+
/**
  Shape of a document in the "ingest_meta" collection.
 */
export type IngestMetaEntry = {
  /** Document ID; stores look their entry up by this value. */
  _id: string;

  /** Date written when a run completes successfully. */
  lastIngestDate: Date;
};
44+
45+
/**
  Creates a store bound to one document of the "ingest_meta" collection.

  Opens a new MongoDB client connection. The caller owns that connection and
  must call `close()` on the returned store when done.

  @param connectionUri - Connection string handed to `MongoClient.connect`.
  @param databaseName - Database that holds the "ingest_meta" collection.
  @param entryId - `_id` of the metadata document to read and write.
  @returns A store whose operations all target the `entryId` document.
 */
export const makeIngestMetaStore = async ({
  connectionUri,
  databaseName,
  entryId,
}: {
  connectionUri: string;
  databaseName: string;
  entryId: string;
}): Promise<IngestMetaStore> => {
  const client = await MongoClient.connect(connectionUri);
  const collection = client
    .db(databaseName)
    .collection<IngestMetaEntry>("ingest_meta");

  return {
    entryId,

    async close() {
      await client.close();
    },

    async loadLastSuccessfulRunDate() {
      // No document yet means no run has succeeded: report null.
      const entry = await collection.findOne({ _id: entryId });
      return entry?.lastIngestDate ?? null;
    },

    async updateLastSuccessfulRunDate() {
      // On insert, the upsert takes `_id` from the equality filter, so only
      // the date has to be set. `_id` is immutable and must not be rewritten.
      await collection.updateOne(
        { _id: entryId },
        { $set: { lastIngestDate: new Date() } },
        { upsert: true }
      );
    },
  };
};

ingest/src/commands/all.test.ts

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import { PageStore, EmbeddedContentStore, assertEnvVars } from "chat-core";
2+
import { MongoClient } from "mongodb";
3+
import { INGEST_ENV_VARS } from "../IngestEnvVars";
4+
import { doAllCommand } from "./all";
5+
import { makeIngestMetaStore } from "../IngestMetaStore";
6+
7+
import "dotenv/config";
8+
9+
// These tests talk to a real MongoDB instance, so allow a generous timeout.
jest.setTimeout(1000000);

describe("allCommand", () => {
  const { MONGODB_CONNECTION_URI: connectionUri } =
    assertEnvVars(INGEST_ENV_VARS);

  // No-op stores: the tests below only exercise the ingest metadata
  // bookkeeping, not page loading or embedding.
  const mockEmbeddedContentStore: EmbeddedContentStore = {
    async deleteEmbeddedContent() {
      return;
    },
    async findNearestNeighbors() {
      return [];
    },
    async loadEmbeddedContent() {
      return [];
    },
    async updateEmbeddedContent() {
      return;
    },
  };
  const mockPageStore: PageStore = {
    async loadPages() {
      return [];
    },
    async updatePages() {
      return;
    },
  };

  let databaseName: string;

  beforeEach(async () => {
    // Fresh, uniquely named database per test so runs cannot interfere.
    databaseName = `test-all-command-${Date.now()}-${Math.floor(
      Math.random() * 10000000
    )}`;
  });

  afterEach(async () => {
    const client = await MongoClient.connect(connectionUri);
    try {
      const db = client.db(databaseName);
      await db.dropDatabase();
    } finally {
      await client.close();
    }
  });

  it("updates the metadata with the last successful timestamp", async () => {
    const ingestMetaStore = await makeIngestMetaStore({
      connectionUri,
      databaseName,
      entryId: "all",
    });
    try {
      let lastSuccessfulRunDate =
        await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate).toBeNull();
      await doAllCommand({
        pageStore: mockPageStore,
        embeddedContentStore: mockEmbeddedContentStore,
        connectionUri,
        databaseName,
        async doPagesCommand() {
          return;
        },
      });
      lastSuccessfulRunDate = await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate?.getTime()).toBeGreaterThan(
        Date.now() - 5000
      );
      expect(lastSuccessfulRunDate?.getTime()).toBeLessThanOrEqual(Date.now());
    } finally {
      await ingestMetaStore.close();
    }
  });

  it("does not update the metadata with the last successful timestamp on failure", async () => {
    const ingestMetaStore = await makeIngestMetaStore({
      connectionUri,
      databaseName,
      entryId: "all",
    });
    try {
      let lastSuccessfulRunDate =
        await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate).toBeNull();
      // `rejects` makes the test fail if the command resolves instead of
      // propagating the error; a bare try/catch would skip the assertion.
      await expect(
        doAllCommand({
          pageStore: mockPageStore,
          embeddedContentStore: mockEmbeddedContentStore,
          connectionUri,
          databaseName,
          async doPagesCommand() {
            // Sudden failure!
            throw new Error("Fail!");
          },
        })
      ).rejects.toThrow("Fail!");
      lastSuccessfulRunDate = await ingestMetaStore.loadLastSuccessfulRunDate();
      // Not updated because run failed
      expect(lastSuccessfulRunDate).toBeNull();
    } finally {
      await ingestMetaStore.close();
    }
  });
});

ingest/src/commands/all.ts

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,84 @@
11
import { CommandModule } from "yargs";
2+
import { doPagesCommand as officialDoPages } from "./pages";
3+
import { doEmbedCommand } from "./embed";
4+
import {
5+
makeDatabaseConnection,
6+
assertEnvVars,
7+
EmbeddedContentStore,
8+
PageStore,
9+
logger,
10+
} from "chat-core";
11+
import { INGEST_ENV_VARS } from "../IngestEnvVars";
12+
import { makeIngestMetaStore } from "../IngestMetaStore";
213

3-
const commandModule: CommandModule = {
14+
const commandModule: CommandModule<unknown, unknown> = {
415
command: "all",
516
async handler() {
6-
console.log("Hello all!");
7-
console.log("The time is:", new Date().toISOString());
17+
const { MONGODB_CONNECTION_URI, MONGODB_DATABASE_NAME } =
18+
assertEnvVars(INGEST_ENV_VARS);
19+
20+
const store = await makeDatabaseConnection({
21+
connectionUri: MONGODB_CONNECTION_URI,
22+
databaseName: MONGODB_DATABASE_NAME,
23+
});
24+
25+
try {
26+
await doAllCommand({
27+
pageStore: store,
28+
embeddedContentStore: store,
29+
connectionUri: MONGODB_CONNECTION_URI,
30+
databaseName: MONGODB_DATABASE_NAME,
31+
});
32+
} finally {
33+
await store.close();
34+
}
835
},
9-
describe: "Testing command",
36+
describe: "Run 'pages' and 'embed' since last successful run",
1037
};
1138

1239
export default commandModule;
40+
41+
/**
  Runs the 'pages' step followed by the 'embed' step, then records the run as
  successful in the ingest metadata entry with ID "all".

  The embed step is given the previously recorded successful run date as its
  `since` value, falling back to 2023-01-01 when no run has been recorded. The
  recorded date is only updated if both steps complete without throwing; the
  metadata store connection is closed either way.

  @param pageStore - Store passed to both the pages and embed steps.
  @param embeddedContentStore - Store passed to the embed step.
  @param connectionUri - Connection string for the ingest metadata store.
  @param databaseName - Database holding the ingest metadata.
  @param doPagesCommand - Pages step implementation; defaults to the real one.
 */
export const doAllCommand = async ({
  pageStore,
  embeddedContentStore,
  connectionUri,
  databaseName,
  doPagesCommand = officialDoPages,
}: {
  pageStore: PageStore;
  embeddedContentStore: EmbeddedContentStore;
  connectionUri: string;
  databaseName: string;

  // Injectable so unit tests can skip really loading pages from every
  // source, which would waste time.
  doPagesCommand?: typeof officialDoPages;
}) => {
  const metaStore = await makeIngestMetaStore({
    entryId: "all",
    connectionUri,
    databaseName,
  });

  try {
    const previousRunDate = await metaStore.loadLastSuccessfulRunDate();
    logger.info(`Last successful run date: ${previousRunDate}`);

    await doPagesCommand({ store: pageStore });

    // With no recorded run, start from a fixed early date.
    const since = previousRunDate ?? new Date("2023-01-01");
    await doEmbedCommand({ since, pageStore, embeddedContentStore });

    // Reached only when both steps succeeded.
    logger.info(`Updating last successful run date`);
    await metaStore.updateLastSuccessfulRunDate();
  } finally {
    await metaStore.close();
  }
};

0 commit comments

Comments
 (0)