From c5b39d857857ca3292f2d8191cb98ce98a815808 Mon Sep 17 00:00:00 2001 From: Derek Au Date: Mon, 11 Dec 2023 11:14:36 -0500 Subject: [PATCH 1/3] enforce keywords array, openai image keywords --- .../ingesters/met/collectionsIngester.test.ts | 80 +++++++ config/site.ts | 2 +- lib/elasticsearch/import.ts | 30 +++ .../extract/openAiExhibitionsExtractor.ts | 2 + .../ingesters/met/collectionsIngester.ts | 14 +- lib/import/ingesters/met/types.ts | 2 +- lib/import/ingesters/rss/util.ts | 2 +- lib/openai/openaiArtworkKeywords.ts | 138 ++++++++++++ lib/schema.ts | 2 +- package-lock.json | 206 +++++++++++++++--- package.json | 2 +- types/document.ts | 4 +- 12 files changed, 445 insertions(+), 39 deletions(-) create mode 100644 __tests__/lib/import/ingesters/met/collectionsIngester.test.ts create mode 100644 lib/openai/openaiArtworkKeywords.ts diff --git a/__tests__/lib/import/ingesters/met/collectionsIngester.test.ts b/__tests__/lib/import/ingesters/met/collectionsIngester.test.ts new file mode 100644 index 0000000..6b0c4d9 --- /dev/null +++ b/__tests__/lib/import/ingesters/met/collectionsIngester.test.ts @@ -0,0 +1,80 @@ +import { ingester } from '@/lib/import/ingesters/met/collectionsIngester'; +import { ArtworkDocument } from '@/types/document'; + +const mockMetDocument = { + 'Object Number': '04.1a–c', + 'Is Highlight': true, + 'Is Timeline Work': true, + 'Is Public Domain': false, + 'Object ID': 35, + 'Gallery Number': 706, + Department: 'The American Wing', + AccessionYear: 1904, + 'Object Name': 'Vase', + Title: 'The Adams Vase', + Culture: 'American', + Period: null, + Dynasty: null, + Reign: null, + Portfolio: null, + 'Constituent ID': 108316253, + 'Artist Role': 'Designer|Manufacturer', + 'Artist Prefix': 'Designed by|Manufactured by', + 'Artist Display Name': 'Paulding Farnham|Tiffany & Co.', + 'Artist Display Bio': '1859–1927|1837–present', + 'Artist Suffix': ' | ', + 'Artist Alpha Sort': 'Farnham, Paulding|Tiffany & Co.', + 'Artist Nationality': 'American| ', + 'Artist Begin Date': '1859 |1837 ', + 'Artist End Date': '1927 |9999 ', + 'Artist Gender': '|', + 'Artist ULAN URL': + 'http://vocab.getty.edu/page/ulan/500336597|http://vocab.getty.edu/page/ulan/500330306', + 'Artist Wikidata URL': + 'https://www.wikidata.org/wiki/Q13476260|https://www.wikidata.org/wiki/Q1066858', + 'Object Date': '1893–95', + 'Object Begin Date': 1893, + 'Object End Date': 1895, + Medium: + 'Gold, amethysts, spessartites, tourmalines, fresh water pearls, quartzes, rock crystal, and enamel', + Dimensions: + 'Overall: 19 7/16 x 13 x 9 1/4 in. (49.4 x 33 x 23.5 cm); 352 oz. 18 dwt. (10977 g) Body: H. 18 7/8 in. (47.9 cm) Cover: 4 1/4 x 4 13/16 in. (10.8 x 12.2 cm); 19 oz. 6 dwt. (600.1 g)', + 'Credit Line': 'Gift of Edward D. Adams, 1904', + 'Geography Type': 'Made in', + City: 'New York', + State: null, + County: null, + Country: 'United States', + Region: null, + Subregion: null, + Locale: null, + Locus: null, + Excavation: null, + River: null, + Classification: null, + 'Rights and Reproduction': null, + 'Link Resource': 'http://www.metmuseum.org/art/collection/search/35', + 'Object Wikidata URL': 'https://www.wikidata.org/wiki/Q83545838', + 'Metadata Date': null, + Repository: 'Metropolitan Museum of Art, New York, NY', + Tags: 'Animals|Garlands|Birds|Men', + 'Tags AAT URL': + 'http://vocab.getty.edu/page/aat/300249525|http://vocab.getty.edu/page/aat/300167386|http://vocab.getty.edu/page/aat/300266506|http://vocab.getty.edu/page/aat/300025928', + 'Tags Wikidata URL': + 'https://www.wikidata.org/wiki/Q729|https://www.wikidata.org/wiki/Q756600|https://www.wikidata.org/wiki/Q5113|https://www.wikidata.org/wiki/Q8441', +}; + +describe('transformDoc', () => { + it('should transform MetDocument into ArtworkDocument', async () => { + const esDoc = await ingester.transform(mockMetDocument) as ArtworkDocument; + + expect(esDoc.source).toBe('The Met'); + expect(esDoc.id).toBe('35'); + expect(esDoc.title).toBe('The Adams Vase'); + expect(esDoc.dimensions).toContain('19 7/16 x 13 x 9 1/4 in.'); + expect(esDoc.highlight).toBe(true); + expect(esDoc.keywords).toBeDefined(); + expect(esDoc.keywords?.length).toBe(4); + expect(esDoc.primaryConstituent?.name).toBe('Paulding Farnham'); + }); +}); diff --git a/config/site.ts b/config/site.ts index 7da6185..98c3a98 100644 --- a/config/site.ts +++ b/config/site.ts @@ -57,7 +57,7 @@ export const siteConfig: SiteConfig = { 'whitney/collectionsIngester', 'met/collectionsIngester', ], - extractors: ['openAiExhibitionsExtractor'], + extractors: [], exhibitionUrls: [ { url: 'https://www.moma.org/calendar/exhibitions/', diff --git a/lib/elasticsearch/import.ts b/lib/elasticsearch/import.ts index 5b0b659..09c8aae 100644 --- a/lib/elasticsearch/import.ts +++ b/lib/elasticsearch/import.ts @@ -1,6 +1,7 @@ import { Client } from '@elastic/elasticsearch'; import * as T from '@elastic/elasticsearch/lib/api/types'; +import { BaseDocument } from '@/types/document'; import { art, events, news, terms } from './indices'; const indices = { @@ -319,3 +320,32 @@ export function getBulkOperationArray( method === 'update' ? { doc, doc_as_upsert: true } : { doc }, ]; } + +/** + * Upsert a document in an index. Doc not guaranteed to contain _id or _index, + * so force those arguments in function signature. + * + * @param client Elasticsearch client. + * @param index Elasticsearch index. + * @param id Elasticsearch document id. + * @param document Elasticsearch document. + */ +export async function upsertDocument( + client: Client, + index: string, + id: string, + document: BaseDocument +) { + const doc = { ...document }; + delete doc._id; + delete doc._index; + await client.update({ + index, + id, + body: { + doc, + doc_as_upsert: true, + }, + refresh: true, + }); +} diff --git a/lib/import/extract/openAiExhibitionsExtractor.ts b/lib/import/extract/openAiExhibitionsExtractor.ts index f757c8e..7e682b0 100644 --- a/lib/import/extract/openAiExhibitionsExtractor.ts +++ b/lib/import/extract/openAiExhibitionsExtractor.ts @@ -1,4 +1,6 @@ /** + * Deprecated, doesn't work that great and relies on old version of OpenAI API. + * * Attempt to extract exhibition information from web pages. * 1. Get markdown from web page * 2. Call OpenAI GPT function to extract JSON exhibition data diff --git a/lib/import/ingesters/met/collectionsIngester.ts b/lib/import/ingesters/met/collectionsIngester.ts index d305c26..dced4d7 100644 --- a/lib/import/ingesters/met/collectionsIngester.ts +++ b/lib/import/ingesters/met/collectionsIngester.ts @@ -25,9 +25,13 @@ const INDEX_NAME = 'art'; const SOURCE_ID = 'met'; const DOC_TYPE = 'artwork'; -function getKeywords(doc: MetDocument): string | undefined { - const keywords = doc['Tags']?.split('|').map((s) => s.trim()); - if (keywords?.length) return keywords.join(', '); +/** + * Elasticsearch keywords are stored as an array of strings + * @param doc Met collections document + * @returns Array of keywords or undefined + */ +function getKeywords(doc: MetDocument): string[] | undefined { + return doc['Tags']?.split('|').map((s) => s.trim()); } async function getConstituents( @@ -200,8 +204,8 @@ async function transformDoc(doc: MetDocument): Promise { dynasty: doc['Dynasty'] || undefined, portfolio: doc['Portfolio'] || undefined, rightsType: doc['Rights and Reproduction'] || undefined, - publicAccess: doc['Is Public Domain']?.toLowerCase() === 'true', - highlight: doc['Is Highlight']?.toLowerCase() === 'true', + publicAccess: getBooleanValue(doc['Is Public Domain']) === true, + highlight: getBooleanValue(doc['Is Highlight']) === true, formattedDate: doc['Object Date'] || undefined, startYear: parseInt(doc['Object Begin Date'], 10) || undefined, endYear: parseInt(doc['Object End Date'], 10) || undefined, diff --git a/lib/import/ingesters/met/types.ts b/lib/import/ingesters/met/types.ts index 4c38c4a..b74e5c9 100644 --- a/lib/import/ingesters/met/types.ts +++ b/lib/import/ingesters/met/types.ts @@ -2,7 +2,7 @@ export interface MetDocument { "Object Number": string; // Accession Number "Is Highlight": string; "Is Timeline Work": string; - "Is Public Domain": string; + "Is Public Domain": string | boolean; "Object ID": string; "Gallery Number": string; Department: string; diff --git a/lib/import/ingesters/rss/util.ts b/lib/import/ingesters/rss/util.ts index fa7f747..cf6f363 100644 --- a/lib/import/ingesters/rss/util.ts +++ b/lib/import/ingesters/rss/util.ts @@ -91,7 +91,7 @@ export function transformRssItem( title, description, searchText, - keywords: item.category?.length ? item.category.join(', ') : undefined, + keywords: item.category?.length ? item.category : undefined, image: { url: thumbnailUrl, thumbnailUrl: thumbnailUrl, diff --git a/lib/openai/openaiArtworkKeywords.ts b/lib/openai/openaiArtworkKeywords.ts new file mode 100644 index 0000000..c6e06cf --- /dev/null +++ b/lib/openai/openaiArtworkKeywords.ts @@ -0,0 +1,138 @@ +/** + * OpenAI Vision API: + * https://platform.openai.com/docs/guides/vision + * + * For possibly mapping keywords to emojis: + * https://unicode.org/Public/emoji/15.1/ + * https://unicode.org/emoji/charts/emoji-list.html + */ +import { loadEnvConfig } from '@next/env'; +import OpenAI from 'openai'; + +import { BaseDocument } from '@/types/document'; +import { getClient } from '@/lib/elasticsearch/client'; +import { upsertDocument } from '@/lib/elasticsearch/import'; +import { getDocument } from '@/lib/elasticsearch/search/document'; + +const OPENAI_VISION_MODEL = 'gpt-4-vision-preview'; + +loadEnvConfig(process.cwd()); + +/** + * Get keywords from an image and update the document + * + * @param index the Elasticsearch index + * @param id the Elasticsearch document id + * @param document the Elasticsearch document + * @param method the method to use to update the keywords, either 'append' or 'replace' + * @returns void + */ +export async function updateDocumentKeywordsFromImage( + index: string, + id: string, + document: BaseDocument, + method: 'append' | 'replace' = 'append' +): Promise { + if (!document?.image?.thumbnailUrl) return; + + const keywords = await getKeywordsFromImage(document.image.thumbnailUrl); + if (keywords && keywords.length > 0) { + if ( + method === 'replace' || + !document.keywords || + document.keywords.length === 0 + ) { + document.keywords = keywords; + } else { + const keywordsSet = new Set([...document.keywords, ...keywords]); + document.keywords = [...keywordsSet]; + } + const client = getClient(); + await upsertDocument(client, index, id, document); + } +} + +/** + * Use GPT to get keywords for an image + * + * @param imageUrl The URL of the image + * @returns A string of keywords + */ +export async function getKeywordsFromImage( + imageUrl: string +): Promise { + if (!imageUrl || !process.env.OPENAI_API_KEY) return; + + const promptText = `As an art historian with deep knowledge of visual art, carefully examine the artwork provided. Generate a comma-separated list of significant single-word keywords that precisely represent the visible subjects or elements within this artwork, while excluding any general or universally applicable art-related terms. Avoid mentioning any elements like frames or palettes unless they are explicitly depicted within the artwork itself. Do not include any terms that might be sensitive or inappropriate. Concentrate on aspects that are unique to this piece of art, steering clear of broad categories such as art types or colors. Your response should consist solely of keywords that directly correspond to the observable content within the artwork, without any additional text or explanations. Please provide only the comma-separated single-word keywords.`; + + const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }); + + try { + const params: OpenAI.Chat.ChatCompletionCreateParams = { + messages: [ + { + role: 'user', + content: [ + { + type: 'text', + text: promptText, + }, + { + type: 'image_url', + image_url: { + url: imageUrl, + }, + }, + ], + }, + ], + model: OPENAI_VISION_MODEL, + }; + const chatCompletion: OpenAI.Chat.ChatCompletion = + await openai.chat.completions.create(params); + + if (chatCompletion?.usage) { + console.log(chatCompletion.usage); + } + if (chatCompletion.choices?.[0].message?.content) { + const content = chatCompletion.choices[0].message.content; + if (content) { + const keywords = content + .split(',') + .map((keyword) => keyword.trim().toLowerCase()) + .filter((keyword) => keyword.length > 0); + if (keywords?.length > 0) { + console.log(keywords); + return keywords; + } + } + } + } catch (error) { + if (error.response) { + console.error(error.response); + } else if (error.message) { + console.error(error.message); + } else { + console.error(error); + } + } +} + +/** + * Test the OpenAI API + */ +/* +export async function testOpenAI(id): Promise { + const resp = await getDocument('art', id); + const doc = resp?.data as BaseDocument; + if (!doc?.image?.thumbnailUrl) { + console.log('no image'); + return; + } + await updateDocumentKeywordsFromImage('art', id, doc, 'append'); +} + +testOpenAI('bkm_224999'); +*/ \ No newline at end of file diff --git a/lib/schema.ts b/lib/schema.ts index 541df99..ff3cf52 100644 --- a/lib/schema.ts +++ b/lib/schema.ts @@ -73,7 +73,7 @@ export function getSchemaVisualArtwork(item: ArtworkDocument | undefined) { if (item.creditLine) schema.creditText = item.creditLine; if (item.formattedDate) schema.dateCreated = item.formattedDate; // TODO schema.inLanguage = 'English'; // TODO - if (item.keywords) schema.keywords = item.keywords; + if (item.keywords?.length) schema.keywords = item.keywords.join(', '); return schema; } diff --git a/package-lock.json b/package-lock.json index bf965e6..7edf02d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -64,7 +64,7 @@ "next": "^14.0.1", "next-themes": "npm:@wits/next-themes@^0.2.16", "node-kmeans": "^1.1.9", - "openai": "^3.3.0", + "openai": "^4.20.1", "openseadragon": "^4.1.0", "playwright": "^1.39.0", "react": "^18.2.0", @@ -5012,8 +5012,29 @@ "node_modules/@types/node": { "version": "20.3.3", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.3.3.tgz", - "integrity": "sha512-wheIYdr4NYML61AjC8MKj/2jrR/kDQri/CIpVoZwldwhnIrD/j9jIU5bJ8yBKuB2VhpFV7Ab6G2XkBjv9r9Zzw==", - "devOptional": true + "integrity": "sha512-wheIYdr4NYML61AjC8MKj/2jrR/kDQri/CIpVoZwldwhnIrD/j9jIU5bJ8yBKuB2VhpFV7Ab6G2XkBjv9r9Zzw==" + }, + "node_modules/@types/node-fetch": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.9.tgz", + "integrity": "sha512-bQVlnMLFJ2d35DkPNjEPmd9ueO/rh5EiaZt2bhqiSarPjZIuIV6bPQVqcrEyvNo+AfTrRGVazle1tl597w3gfA==", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/@types/node-fetch/node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } }, "node_modules/@types/prop-types": { "version": "15.7.5", @@ -5501,6 +5522,17 @@ "integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA==", "dev": true }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, "node_modules/acorn": { "version": "8.9.0", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.9.0.tgz", @@ -5553,6 +5585,17 @@ "node": ">= 6.0.0" } }, + "node_modules/agentkeepalive": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", @@ -6113,6 +6156,11 @@ "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==" }, + "node_modules/base-64": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/base-64/-/base-64-0.1.0.tgz", + "integrity": "sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA==" + }, "node_modules/base64-js": { "version": "1.5.1", "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", @@ -6405,6 +6453,14 @@ "node": ">=10" } }, + "node_modules/charenc": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz", + "integrity": "sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==", + "engines": { + "node": "*" + } + }, "node_modules/cheerio": { "version": "1.0.0-rc.12", "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz", @@ -7002,6 +7058,14 @@ "node": ">= 8" } }, + "node_modules/crypt": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/crypt/-/crypt-0.0.2.tgz", + "integrity": "sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==", + "engines": { + "node": "*" + } + }, "node_modules/css-select": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", @@ -7500,6 +7564,15 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/digest-fetch": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/digest-fetch/-/digest-fetch-1.3.0.tgz", + "integrity": "sha512-CGJuv6iKNM7QyZlM2T3sPAdZWd/p9zQiRNS9G+9COUCwzWFTs0Xp8NF5iePx7wtvhDykReiRRrSeNb4oMmB8lA==", + "dependencies": { + "base-64": "^0.1.0", + "md5": "^2.3.0" + } + }, "node_modules/dir-glob": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", @@ -8475,6 +8548,14 @@ "node": ">=0.10.0" } }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "engines": { + "node": ">=6" + } + }, "node_modules/execa": { "version": "7.1.1", "resolved": "https://registry.npmjs.org/execa/-/execa-7.1.1.tgz", @@ -8736,6 +8817,31 @@ "node": ">= 0.12" } }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + "node_modules/formdata-node/node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "engines": { + "node": ">= 14" + } + }, "node_modules/fraction.js": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.2.0.tgz", @@ -9358,6 +9464,14 @@ "node": ">=14.18.0" } }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "dependencies": { + "ms": "^2.0.0" + } + }, "node_modules/iconv-lite": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", @@ -11950,6 +12064,16 @@ "resolved": "https://registry.npmjs.org/math-expression-evaluator/-/math-expression-evaluator-1.4.0.tgz", "integrity": "sha512-4vRUvPyxdO8cWULGTh9dZWL2tZK6LDBvj+OGHBER7poH9Qdt7kXEoj20wiz4lQUbUXQZFjPbe5mVDo9nutizCw==" }, + "node_modules/md5": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz", + "integrity": "sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==", + "dependencies": { + "charenc": "0.0.2", + "crypt": "0.0.2", + "is-buffer": "~1.1.6" + } + }, "node_modules/merge-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", @@ -12222,6 +12346,24 @@ "node": ">=v0.6.5" } }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "engines": { + "node": ">=10.5.0" + } + }, "node_modules/node-fetch": { "version": "2.7.0", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", @@ -12540,33 +12682,30 @@ } }, "node_modules/openai": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/openai/-/openai-3.3.0.tgz", - "integrity": "sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==", - "dependencies": { - "axios": "^0.26.0", - "form-data": "^4.0.0" + "version": "4.20.1", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.20.1.tgz", + "integrity": "sha512-Dd3q8EvINfganZFtg6V36HjrMaihqRgIcKiHua4Nq9aw/PxOP48dhbsk8x5klrxajt5Lpnc1KTOG5i1S6BKAJA==", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "digest-fetch": "^1.3.0", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7", + "web-streams-polyfill": "^3.2.1" + }, + "bin": { + "openai": "bin/cli" } }, - "node_modules/openai/node_modules/axios": { - "version": "0.26.1", - "resolved": "https://registry.npmjs.org/axios/-/axios-0.26.1.tgz", - "integrity": "sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==", + "node_modules/openai/node_modules/@types/node": { + "version": "18.19.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.3.tgz", + "integrity": "sha512-k5fggr14DwAytoA/t8rPrIz++lXK7/DqckthCmoZOKNsEbJkId4Z//BqgApXBUGrGddrigYa1oqheo/7YmW4rg==", "dependencies": { - "follow-redirects": "^1.14.8" - } - }, - "node_modules/openai/node_modules/form-data": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", - "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.8", - "mime-types": "^2.1.12" - }, - "engines": { - "node": ">= 6" + "undici-types": "~5.26.4" } }, "node_modules/openseadragon": { @@ -14839,6 +14978,11 @@ "node": ">=14.0" } }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "node_modules/uniq": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/uniq/-/uniq-1.0.1.tgz", @@ -15037,6 +15181,14 @@ "node": ">=10.13.0" } }, + "node_modules/web-streams-polyfill": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz", + "integrity": "sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==", + "engines": { + "node": ">= 8" + } + }, "node_modules/webidl-conversions": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", diff --git a/package.json b/package.json index 16d82f6..6a9e689 100644 --- a/package.json +++ b/package.json @@ -70,7 +70,7 @@ "next": "^14.0.1", "next-themes": "npm:@wits/next-themes@^0.2.16", "node-kmeans": "^1.1.9", - "openai": "^3.3.0", + "openai": "^4.20.1", "openseadragon": "^4.1.0", "playwright": "^1.39.0", "react": "^18.2.0", diff --git a/types/document.ts b/types/document.ts index 91cfc9b..2c12d28 100644 --- a/types/document.ts +++ b/types/document.ts @@ -65,8 +65,8 @@ export interface BaseDocument { title?: string; description?: string; searchText?: string; - keywords?: string; - boostedKeywords?: string; + keywords?: string[]; + boostedKeywords?: string[]; primaryConstituent?: DocumentConstituent; image?: DocumentImage; date?: string; From f005430ad9021946b9ca266c080f3692265e155b Mon Sep 17 00:00:00 2001 From: Derek Philip Au <22045002+derekphilipau@users.noreply.github.com> Date: Mon, 11 Dec 2023 13:17:55 -0500 Subject: [PATCH 2/3] isNow filter for current events --- components/search/event-search-checkboxes.tsx | 2 - components/timeline/timeline.tsx | 119 ++++++++++-------- config/site.ts | 2 +- dictionaries/lang/en.json | 2 +- lib/elasticsearch/search/search.ts | 8 +- lib/elasticsearch/search/searchParams.ts | 13 ++ .../search/searchQueryBuilder.ts | 51 ++++++-- 7 files changed, 127 insertions(+), 70 deletions(-) diff --git a/components/search/event-search-checkboxes.tsx b/components/search/event-search-checkboxes.tsx index fc546cd..515658d 100644 --- a/components/search/event-search-checkboxes.tsx +++ b/components/search/event-search-checkboxes.tsx @@ -12,7 +12,6 @@ export function EventSearchCheckboxes({ params }: EventSearchCheckboxesProps) { return (
- {/*
- */}
40 chars, truncate and add ellipsis: const myTitle = @@ -44,8 +33,9 @@ function getBarText(item: BaseDocument | EventDocument) { return `${myTitle} - ${item.source}`; } -function getDomainMin(items: (BaseDocument | EventDocument)[]) { - const min = Math.min( +function getDomainMin(items: EventDocument[]) { + if (!(items?.length > 0)) return new Date().getTime(); + const min = Math.max( ...items .filter((item) => item.date) .map((item) => new Date(item.date || '').getTime()) @@ -54,31 +44,41 @@ function getDomainMin(items: (BaseDocument | EventDocument)[]) { } function getDomainMax(items: (BaseDocument | EventDocument)[]) { + const maxDate = new Date(new Date().setMonth(new Date().getMonth() + 6)) + .toISOString() + .split('T')[0]; + if (!(items?.length > 0)) return new Date().getTime(); const max = Math.max( ...items - .filter((item): item is EventDocument => 'endDate' in item && item.endDate !== undefined) + .filter( + (item): item is EventDocument => + 'endDate' in item && item.endDate !== undefined + ) .map((item) => { - // if item.endDate is greater than 5 years into future, don't use it: - const endDate = new Date(item.endDate || '').getTime(); - return endDate > - new Date( - new Date().setFullYear(new Date().getFullYear() + 5) - ).getTime() - ? new Date(maxDate).getTime() - : endDate; + // if item.endDate is greater than 3 years into future, don't use it: + try { + const endDate = new Date(item.endDate || '').getTime(); + return endDate > + new Date( + new Date().setFullYear(new Date().getFullYear() + 20) + ).getTime() + ? new Date(maxDate).getTime() + : endDate; + } catch (e) { + return 0; + } }) ); return max; } -interface TimelineProps { - items: (BaseDocument | EventDocument)[]; -} - -export function Timeline({ items }: TimelineProps) { - const dict = getDictionary(); - // create a new array of items, sorted by location: - const sortedItems = items.sort((a, b) => { +/** + * Sort items by location and sourceId + * @param items Array of items to sort + * @returns Sorted array of items + */ +function getSortedItems(items: (BaseDocument | EventDocument)[]) { + return [...items].sort((a, b) => { if (a.sourceId && b.sourceId) { const locationA = a.sourceId && sources[a.sourceId]?.location; const locationB = b.sourceId && sources[b.sourceId]?.location; @@ -94,35 +94,44 @@ export function Timeline({ items }: TimelineProps) { } return 0; }); +} - // for each item, if endDate > maxTime, set endDate to maxTime - const maxTime = getDomainMax(sortedItems); - const maxDate = format(new Date(maxTime), 'yyyy-MM-dd'); +function getMinTimeWithinDomain(item: EventDocument, minTime: number) { + if (item.date) return new Date(item.date).getTime(); + return minTime; +} + +function getMaxTimeWithinDomain(item: EventDocument, maxTime: number) { + if (item.endDate && new Date(item.endDate).getTime() < maxTime) { + return new Date(item.endDate).getTime(); + } + return maxTime; +} + +const chartMargin = { top: 40, right: 10, bottom: 10, left: 10 }; +const chartWidth = 1200; +const chartHeight = 600; + +interface TimelineProps { + items: (BaseDocument | EventDocument)[]; +} + +export function Timeline({ items }: TimelineProps) { + const dict = getDictionary(); + const sortedItems = getSortedItems(items); const minTime = getDomainMin(sortedItems); - const minDate = format(new Date(minTime), 'yyyy-MM-dd'); - sortedItems.forEach((item: EventDocument) => { - // if item.endDate is greater than 5 years into future, don't use it: - if (item.endDate) { - const endDate = new Date(item.endDate).getTime(); - if ( - endDate > - new Date(new Date().setFullYear(new Date().getFullYear() + 5)).getTime() - ) { - item.endDate = format(new Date(maxTime), 'yyyy-MM-dd'); - } - } - }); + const maxTime = getDomainMax(sortedItems); const timeScale = scaleLinear({ - domain: [getDomainMin(sortedItems), getDomainMax(sortedItems)], - range: [margin.left, width - margin.right], // Now for horizontal + domain: [minTime, maxTime], + range: [chartMargin.left, chartWidth - chartMargin.right], // Now for horizontal }); const itemScale = scaleBand({ domain: sortedItems .filter((item) => item.title) .map((item) => item.title) as string[], - range: [height - margin.bottom, margin.top], + range: [chartHeight - chartMargin.bottom, chartMargin.top], padding: 0.1, }); @@ -141,12 +150,12 @@ export function Timeline({ items }: TimelineProps) { return ( <>
- + {sortedItems.map((item: EventDocument, i: Key) => { // Swap the usage of scales for x and y - const startX = timeScale(new Date(item.date || minDate).getTime()); - const endX = timeScale(new Date(item.endDate || maxDate).getTime()); + const startX = timeScale(getMinTimeWithinDomain(item, minTime)); + const endX = timeScale(getMaxTimeWithinDomain(item, maxTime)); const barX = Math.min(startX, endX); const barWidth = Math.abs(endX - startX); const barY = itemScale(item.title || '') ?? 0; @@ -212,7 +221,7 @@ export function Timeline({ items }: TimelineProps) { ); })} { const date = new Date(value); @@ -227,8 +236,8 @@ export function Timeline({ items }: TimelineProps) { diff --git a/config/site.ts b/config/site.ts index 98c3a98..fc67119 100644 --- a/config/site.ts +++ b/config/site.ts @@ -226,7 +226,7 @@ export const siteConfig: SiteConfig = { { dict: 'index.events', basePath: 'events', - href: '/events', + href: '/events?isNow=true&f=true', }, ], links: { diff --git a/dictionaries/lang/en.json b/dictionaries/lang/en.json index 083409d..ac7b1ab 100644 --- a/dictionaries/lang/en.json +++ b/dictionaries/lang/en.json @@ -31,7 +31,7 @@ "search.imageUnavailable": "Image Unavailable", "search.noResults": "Sorry, we couldn’t find any results matching your criteria.", "search.didYouMean": "Did you mean:", - "search.isNow": "Now on view", + "search.isNow": "On view", "search.isShowTimeline": "Show timeline", "button.openMenu": "Open Menu", "button.expandFilter": "Expand search filter", diff --git a/lib/elasticsearch/search/search.ts b/lib/elasticsearch/search/search.ts index b49ef6b..5ed3512 100644 --- a/lib/elasticsearch/search/search.ts +++ b/lib/elasticsearch/search/search.ts @@ -12,10 +12,11 @@ import { getClient } from '../client'; import { getElasticsearchIndices, type SearchParams } from './searchParams'; import { addColorQuery, - addQueryAggs, addDefaultQueryBoolDateRange, - addQueryBoolYearRange, + addQueryAggs, + addQueryBoolDateRange, addQueryBoolFilterTerms, + addQueryBoolYearRange, } from './searchQueryBuilder'; import { getTerm, terms } from './terms'; @@ -74,6 +75,9 @@ export async function search( addDefaultQueryBoolDateRange(esQuery, searchParams); // Multi-index search boosts news and events esQuery.indices_boost = [{ news: 1.5 }, { events: 1.5 }, { art: 1 }]; + } else if (searchParams.index === 'events' && searchParams.isNow) { + // Events search has special date range filter + addQueryBoolDateRange(esQuery, new Date(), new Date()); } else { addQueryBoolYearRange(esQuery, searchParams); } diff --git a/lib/elasticsearch/search/searchParams.ts b/lib/elasticsearch/search/searchParams.ts index 0d56ccc..ebbcc5e 100644 --- a/lib/elasticsearch/search/searchParams.ts +++ b/lib/elasticsearch/search/searchParams.ts @@ -119,6 +119,7 @@ export function getSanitizedSearchParams( (typeof params.card === 'string' && params.card) || undefined; sanitizedParams.isShowFilters = getBooleanValue(params.f); sanitizedParams.isShowTimeline = getBooleanValue(params.tl); + sanitizedParams.isNow = getBooleanValue(params.isNow); // date range sanitizedParams.startYear = @@ -233,6 +234,7 @@ export function toURLSearchParams( urlParams.set('onView', searchParams.onView.toString()); if (searchParams.isShowFilters === true) urlParams.set('f', 'true'); if (searchParams.isShowTimeline === true) urlParams.set('tl', 'true'); + if (searchParams.isNow === true) urlParams.set('isNow', 'true'); if (searchParams.layout && searchParams.layout !== LAYOUT_DEFAULT) urlParams.set('layout', searchParams.layout); @@ -399,6 +401,17 @@ export function toggleIsShowTimeline(searchParams: SearchParams): SearchParams { return params; } +/** + * Immutable toggle the isNow flag in search parameters. + * @param searchParams - The current search parameters. + * @returns New search parameters. + */ +export function toggleIsNow(searchParams: SearchParams): SearchParams { + const params = { ...searchParams }; + params.isNow = !params.isNow; + return params; +} + /** * Immutable update of start & end years in search parameters. * Note that years can be negative to indicate B.C.E. diff --git a/lib/elasticsearch/search/searchQueryBuilder.ts b/lib/elasticsearch/search/searchQueryBuilder.ts index 08dbb5e..27249b3 100644 --- a/lib/elasticsearch/search/searchQueryBuilder.ts +++ b/lib/elasticsearch/search/searchQueryBuilder.ts @@ -7,6 +7,39 @@ import { type SearchParams } from './searchParams'; const SEARCH_AGG_SIZE = 20; // 20 results per aggregation +export function addQueryBoolDateRange( + esQuery: any, + fromDate: Date | undefined, + toDate: Date | undefined +) { + if (!fromDate && !toDate) return; + const ranges: T.QueryDslQueryContainer[] = []; + if (fromDate) { + ranges.push({ + range: { + date: { + lte: format(fromDate, 'yyyy-MM-dd'), + }, + }, + }); + } + if (toDate) { + ranges.push({ + range: { + endDate: { + gte: format(toDate, 'yyyy-MM-dd'), + }, + }, + }); + } + if (ranges.length > 0) { + esQuery.query ??= {}; + esQuery.query.bool ??= {}; + esQuery.query.bool.filter ??= []; + esQuery.query.bool.filter.push(...ranges); + } +} + /** * Currently only supports year ranges * @@ -66,8 +99,8 @@ export function addQueryBoolYearRange( /** * For the default date range query, we only want documents (events) that * have already started OR have no start date. - * @param esQuery - * @param searchParams + * @param esQuery + * @param searchParams */ export function addDefaultQueryBoolDateRange( esQuery: any, @@ -87,14 +120,14 @@ export function addDefaultQueryBoolDateRange( bool: { must_not: { exists: { - field: "date" - } - } - } - } + field: 'date', + }, + }, + }, + }, ], - minimum_should_match: 1 - } + minimum_should_match: 1, + }, }; esQuery.query ??= {}; esQuery.query.bool ??= {}; From 976e50c9a5e6334a306b7d54dada9202286bf3d7 Mon Sep 17 00:00:00 2001 From: Derek Au Date: Mon, 11 Dec 2023 21:15:59 -0500 Subject: [PATCH 3/3] comment old openai code --- lib/import/extract/openAiExhibitionsExtractor.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/import/extract/openAiExhibitionsExtractor.ts b/lib/import/extract/openAiExhibitionsExtractor.ts index 7e682b0..3dac0bd 100644 --- a/lib/import/extract/openAiExhibitionsExtractor.ts +++ b/lib/import/extract/openAiExhibitionsExtractor.ts @@ -6,6 +6,8 @@ * 2. Call OpenAI GPT function to extract JSON exhibition data * 3. Parse JSON exhibition data */ +export {} +/** import { loadEnvConfig } from '@next/env'; import { Configuration, CreateChatCompletionRequest, OpenAIApi } from 'openai'; @@ -158,3 +160,4 @@ export const extractor: ElasticsearchExtractor = { return extract(); }, }; +*/ \ No newline at end of file