Merge pull request #36 from derekphilipau/dev

Dev
derekphilipau · Dec 29, 2023 · 55eef36 · 55eef36
2 parents 0f86ed4 + 19ebdac
commit 55eef36
Show file tree

Hide file tree

Showing 11 changed files with 120 additions and 80 deletions.
diff --git a/.env.local.sample b/.env.local.sample
@@ -8,8 +8,12 @@ ELASTICSEARCH_PORT=9200
 ELASTICSEARCH_CA_FILE=./secrets/es01.crt
 ELASTICSEARCH_API_KEY=MVE2aWxZUIJBWkNOUzYwU1ZKbUg6dEllY4JjQkVTZ3lFWlU3RRdLUm5mQQ==
 ELASTICSEARCH_BULK_LIMIT=2000
-API_SECRET=dfJtqJDG9VwN69edUU283qnD
-NEXT_PUBLIC_IMAGE_DOMAIN=rx3rxq8hyni2c.cloudfront.net
+CRON_SECRET=supersecretrandomstringover16characters
+AWS_ACCESS_KEY_ID=myaccesskey
+AWS_SECRET_ACCESS_KEY=mysecretkey
+AWS_REGION=myawsregion
+AWS_BUCKET_NAME=mybucketname
+NEXT_PUBLIC_IMAGE_DOMAIN=mydomain.cloudfront.net
 PROCESS_IMAGES=true
 FORMSPREE_FORM_ID=rwejcdbw
 OPENAI_API_KEY=sk-231rZaTl2w4MuRPOrsT1T9BlckFJes7O2D1RIOqEkvV2SEAZ

diff --git a/.github/workflows/playwright-tests.yml b/.github/workflows/playwright-tests.yml
@@ -21,7 +21,6 @@ jobs:
           BASE_URL: ${{ github.event.deployment_status.environment_url }}
           AWS_REGION: ${{ secrets.AWS_REGION }}
           AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }}
-          API_SECRET: ${{ secrets.API_SECRET }}
           ELASTICSEARCH_CLOUD_ID: ${{ secrets.ELASTICSEARCH_CLOUD_ID }}
           ELASTICSEARCH_CLOUD_USERNAME: ${{ secrets.ELASTICSEARCH_CLOUD_USERNAME }}
           ELASTICSEARCH_CLOUD_PASSWORD: ${{ secrets.ELASTICSEARCH_CLOUD_PASSWORD }}

diff --git a/__tests__/lib/import/ingesters/rss/util.test.ts b/__tests__/lib/import/ingesters/rss/util.test.ts
@@ -1,28 +1,50 @@
 import fs from 'fs';
 import path from 'path';
-import { parseStringPromise } from 'xml2js';
 
 import {
   getRssItemId,
   getRssItemImageUrl,
   parseDate,
+  parseXml,
   transformRssItem,
 } from '@/lib/import/ingesters/rss/util';
 import { stripHtmlTags } from '@/lib/various';
 
-describe('rss import function', () => {
+describe('transformRssItem', () => {
   // Read the content of the RSS feed from the static file
   const testContent = fs.readFileSync(
     path.join(__dirname, './artnews.rss.xml'),
     'utf8'
   );
 
   let items: any[] = [];
+  let nytItemXml: any = {};
 
   // Use beforeAll to process async operations before tests
   beforeAll(async () => {
-    const jsonObj = await parseStringPromise(testContent);
+    const jsonObj = await parseXml(testContent);
     items = jsonObj.rss.channel[0].item;
+
+    const nytItemRss = `
+    <item>
+      <title>How 1993 — and Two Watershed Shows — Help Make Sense of 2023</title>
+      <link>https://www.nytimes.com/2023/12/27/arts/design/1993-whitney-biennial-theater-of-refusal-art.html</link>
+      <guid isPermaLink="true">https://www.nytimes.com/2023/12/27/arts/design/1993-whitney-biennial-theater-of-refusal-art.html</guid>
+      <atom:link href="https://www.nytimes.com/2023/12/27/arts/design/1993-whitney-biennial-theater-of-refusal-art.html" rel="standout"/>
+      <description>A blue-chip gallery asks, does the infamous Whitney Biennial or “The Theater of Refusal” measure up 30 years later, when artists of color have moved to the mainstream?</description>
+      <dc:creator>Aruna D’Souza</dc:creator>
+      <pubDate>Wed, 27 Dec 2023 16:51:11 +0000</pubDate>
+      <category domain="http://www.nytimes.com/namespaces/keywords/des">Art</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/des">Photography</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/des">Race and Ethnicity</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">Hauser & Wirth</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/nyt_per">Fowle, Kate</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/nyt_geo">Los Angeles (Calif)</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/nyt_per">Gaines, Charles</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">Whitney Museum of American Art</category>
+      <category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">University of California</category>
+    </item>`;
+    nytItemXml = (await parseXml(nytItemRss))?.item;
   });
 
   it('should return image for content embedded', () => {
@@ -34,7 +56,6 @@ describe('rss import function', () => {
 
   it("should return the rss item's id", () => {
     const itemId = getRssItemId(items[0]);
-    console.log(itemId);
     expect(itemId).toBe('https://www.artnews.com/?p=1234682122');
   });
 
@@ -51,6 +72,26 @@ describe('rss import function', () => {
     );
     expect(transformedItem.sourceId).toBe(sourceId);
     expect(transformedItem.title).toBe(strippedTitle);
-    expect(transformedItem.keywords).toBe('Art in America, Reviews');
+    expect(transformedItem.keywords).toEqual(['Art in America', 'Reviews']);
+  });
+
+  it('should return the transformed rss for nytimes', () => {
+    const sourceId = 'nytimes';
+    const transformedItem = transformRssItem(nytItemXml, sourceId);
+    expect(transformedItem.sourceId).toBe(sourceId);
+    expect(transformedItem.title).toBe(
+      'How 1993 — and Two Watershed Shows — Help Make Sense of 2023'
+    );
+    expect(transformedItem.keywords).toEqual([
+      'Art',
+      'Photography',
+      'Race and Ethnicity',
+      'Hauser & Wirth',
+      'Fowle, Kate',
+      'Los Angeles (Calif)',
+      'Gaines, Charles',
+      'Whitney Museum of American Art',
+      'University of California',
+    ]);
   });
 });
diff --git a/app/api/import/rss/route.ts b/app/api/import/rss/route.ts
@@ -1,4 +1,4 @@
-import { NextResponse } from 'next/server';
+import { NextResponse, type NextRequest } from 'next/server';
 
 import updateRssFeeds from '@/lib/import/updateRssFeed';
 
@@ -27,15 +27,6 @@ import updateRssFeeds from '@/lib/import/updateRssFeed';
  *                   type: boolean
  *                 message:
  *                   type: string
- *       400:
- *         description: API_SECRET environment variable not set or other bad request.
- *         content:
- *           application/json:
- *             schema:
- *               type: object
- *               properties:
- *                 error:
- *                   type: string
  *       401:
  *         description: Unauthorized request due to invalid secret.
  *         content:
@@ -55,19 +46,13 @@ import updateRssFeeds from '@/lib/import/updateRssFeed';
  *                 error:
  *                   type: object
  */
-export async function GET(request: Request) {
-  const { searchParams } = new URL(request.url);
-  const secret = searchParams.get('secret');
-  const realSecret = process.env.API_SECRET;
-
-  if (!realSecret)
-    return NextResponse.json(
-      { error: 'API_SECRET environment variable not set' },
-      { status: 400 }
-    );
-
-  if (secret !== realSecret)
-    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 });
+export async function GET(request: NextRequest) {
+  const authHeader = request.headers.get('authorization');
+  if (authHeader !== `Bearer ${process.env.CRON_SECRET}`) {
+    return new Response('Unauthorized', {
+      status: 401,
+    });
+  }
 
   try {
     await updateRssFeeds();

diff --git a/components/ui/popover-local.tsx b/components/ui/popover-local.tsx
diff --git a/config/site.ts b/config/site.ts
@@ -191,7 +191,7 @@ export const siteConfig: SiteConfig = {
     {
       ingester: 'rssIngester',
       sourceId: 'artforum',
-      url: 'https://www.artforum.com/rss.xml',
+      url: 'https://www.artforum.com/feed/',
     },
     /*
     LACMA is using their RSS feed for all events, filling up the timeline.

diff --git a/docs/API.md b/docs/API.md
@@ -3,6 +3,7 @@
 API endpoints for search, document retrieval, and RSS feed updates. Next.js API Routes with Route Handlers located in `/app/api` directory.
 
 [Search](#search)
+
 - [GET `/api/search`](#apisearch): Search documents
 - [GET `/api/search/document`](#apisearchdocument): Get a document
 - [GET `/api/search/options`](#apisearchoptions): Get agg options
@@ -11,6 +12,7 @@ API endpoints for search, document retrieval, and RSS feed updates. Next.js API
 - [GET `/api/search/terms`](#apisearchterms): Get terms
 
 [Sync](#sync)
+
 - [GET `/api/import/rss`](#apiimportrss): Import/upsert RSS feeds
 
 ## Search
@@ -347,7 +349,7 @@ API endpoints for search, document retrieval, and RSS feed updates. Next.js API
 
 ### `/api/import/rss`
 
-**GET**: Updates RSS feeds. TODO: Insecure due to "secret" key.
+**GET**: Updates RSS feeds. Requires an `Authorization: Bearer <CRON_SECRET>` header, where `<CRON_SECRET>` is the value of the `CRON_SECRET` environment variable.
 
 - **Summary**: Updates RSS feeds
 - **Description**: Endpoint to update RSS feeds. Requires secret for authentication.
@@ -371,5 +373,3 @@ API endpoints for search, document retrieval, and RSS feed updates. Next.js API
       - **type**: boolean
     - **message**:
       - **type**: string
-
-**400**: API_SECRET environment variable not set or other bad request.
diff --git a/docs/SCHEMA.md b/docs/SCHEMA.md
@@ -27,7 +27,7 @@ Data has been collected from a number of sources, and more sources will be added
 - [Colossal](https://www.thisiscolossal.com): [Feed](https://www.thisiscolossal.com/feed/)
 - [Hi-Fructose](https://hifructose.com): [Feed](https://hifructose.com/feed/)
 - [Juxtapoz](https://www.juxtapoz.com): [Feed](https://www.juxtapoz.com/news/?format=feed&type=rss)
-- [Artforum](https://www.artforum.com): [Feed](https://www.artforum.com/rss.xml)
+- [Artforum](https://www.artforum.com): [Feed](https://www.artforum.com/feed/)
 - [LACMA](https://www.lacma.org): [Feed](https://www.lacma.org/rss.xml)
 - [Aesthetica](https://aestheticamagazine.com): [Feed](https://aestheticamagazine.com/feed/)
 - [New Yorker Daily Cartoon](https://www.newyorker.com): [Feed](https://www.newyorker.com/feed/cartoons/daily-cartoon)
@@ -140,7 +140,7 @@ The base document defines common fields for all indices, these are the fields us
 - `boostedKeywords` - An array of keywords that should be boosted in search results
 - `primaryConstituent` - The primary constituent of the document, e.g. the artist of a painting.
 - `image` - Image. The main image of the document
-- `date` - For RSS feeds, the date of the entry.  For events, the start date of the event.  Not used for artworks.  TODO: Refactor to `startDate` or something clearer.
+- `date` - For RSS feeds, the date of the entry. For events, the start date of the event. Not used for artworks. TODO: Refactor to `startDate` or something clearer.
 - `formattedDate` - A string representing the date, no strict format.
 - `startYear` - An integer representing the start date year. Used for year range filtering.
 - `endYear` - An integer representing the end date year. Used for year range filtering.

diff --git a/docs/SETUP.md b/docs/SETUP.md
@@ -52,7 +52,7 @@ For local development, add a local `.env.local` file in the root directory. If `
 
 On [Formspree](https://formspree.io/) you should set up a basic contact form and enter the `FORMSPREE_FORM_ID` env variable.
 
-OpenAI & Google vars only necessary if using OpenAI to extract exhibition & event data from URL's.  In that case, a Google sheet is used to store the extracted content.
+The OpenAI & Google variables are only necessary if using OpenAI to extract exhibition & event data from URLs. In that case, a Google sheet is used to store the extracted content.
 
 For cloud deployments (for example on Vercel), add the same variables to the Environment Variables of your deployment.
 
@@ -67,7 +67,7 @@ ELASTICSEARCH_PORT=9200
 ELASTICSEARCH_CA_FILE=./secrets/es01.crt
 ELASTICSEARCH_API_KEY=MVE2aWxZUIJBWkNOUzYwU1ZKbUg6dEllY4JjQkVTZ3lFWlU3RRdLUm5mQQ==
 ELASTICSEARCH_BULK_LIMIT=2000
-API_SECRET=dfJtqJDG9VwN69edUU283qnD
+CRON_SECRET=supersecretrandomstringover16characters
 NEXT_PUBLIC_IMAGE_DOMAIN=rx3rxq8hyni2c.cloudfront.net
 PROCESS_IMAGES=true
 FORMSPREE_FORM_ID=rwejcdbw
@@ -122,9 +122,11 @@ For automatic syncing of RSS feeds, add a cron job to your hosting service. For
 {
   "crons": [
     {
-      "path": "/api/import/rss?secret=dfJtqJDG9VwN69edUU283qnD",
+      "path": "/api/import/rss",
       "schedule": "0 * * * *"
     }
   ]
 }
 ```
+
+Please see [Securing cron jobs](https://vercel.com/docs/cron-jobs/manage-cron-jobs#securing-cron-jobs) for more information on securing cron jobs using Vercel's CRON_SECRET environment variable sent as an Authorization header.
diff --git a/lib/import/ingesters/rss/util.ts b/lib/import/ingesters/rss/util.ts
@@ -1,8 +1,34 @@
+import { type } from 'os';
 import { format, getYear, isValid, parse } from 'date-fns';
+import { parseStringPromise } from 'xml2js';
 
 import type { BaseDocument, DocumentConstituent } from '@/types/document';
 import { stripHtmlTags } from '@/lib/various';
 
+/**
+ * Escape bare ampersands so that slightly malformed RSS can still be parsed.
+ *
+ * Some feeds emit a raw '&' in text content, e.g.:
+ * <category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">Hauser & Wirth</category>
+ *
+ * An '&' is left untouched when it already starts one of the five predefined
+ * XML entities (&amp; &lt; &gt; &quot; &apos;) or a decimal / hexadecimal
+ * character reference (&#38; / &#x26;). CDATA sections are copied through
+ * verbatim, because '&' is literal text there and escaping it would change
+ * the content.
+ *
+ * @param xmlStr XML string
+ * @returns XML string with bare ampersands replaced by '&amp;'
+ */
+export function escapeXmlInvalidChars(xmlStr: string): string {
+  // Alternative 1 captures a complete CDATA section; alternative 2 matches a
+  // '&' that is not the start of a recognised entity or character reference.
+  const regex =
+    /(<!\[CDATA\[[\s\S]*?\]\]>)|&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);)/g;
+  return xmlStr.replace(regex, (_match: string, cdata?: string) =>
+    typeof cdata === 'string' ? cdata : '&amp;'
+  );
+}
+
+/**
+ * Convert an XML document into its xml2js object representation.
+ *
+ * The input is first passed through escapeXmlInvalidChars so that feeds
+ * containing unescaped ampersands do not make the parser reject the document.
+ *
+ * @param xmlString XML string
+ * @returns Parsed object as produced by xml2js parseStringPromise
+ */
+export async function parseXml(xmlString: string): Promise<any> {
+  const sanitizedXml = escapeXmlInvalidChars(xmlString);
+  return parseStringPromise(sanitizedXml);
+}
+
 /**
  * Get the image url from the rss item, either from the description or the content
  * @param item RSS <item> element
@@ -64,17 +90,33 @@ export function parseDate(item: any): Date {
   throw new Error('RSS Item Date could not be parsed');
 }
 
+/**
+ * Extract the category names of an RSS item as plain strings.
+ *
+ * Two shapes are handled:
+ * 1. Categories with string values:
+ * <category><![CDATA[Art in America]]></category>
+ * 2. Categories with object values (element has attributes, text is in `_`):
+ * <category domain="http://www.nytimes.com/namespaces/keywords/des">Art</category>
+ *
+ * A single, non-array category value is treated as a one-element list.
+ * Entries without usable text (empty elements, null, unexpected shapes) are
+ * skipped so the result never contains undefined or blank values.
+ *
+ * @param item RSS <item> element
+ * @returns string[] of categories, or undefined if there are none
+ */
+export function getItemCategories(item: any): string[] | undefined {
+  const rawCategories = item?.category;
+  if (rawCategories === undefined || rawCategories === null) return undefined;
+
+  const entries: any[] = Array.isArray(rawCategories)
+    ? rawCategories
+    : [rawCategories];
+
+  const categories: string[] = [];
+  for (const entry of entries) {
+    // Text is either the entry itself or, for elements with attributes, `_`
+    const text = typeof entry === 'string' ? entry : entry?._;
+    if (typeof text === 'string' && text.trim() !== '') categories.push(text);
+  }
+
+  return categories.length > 0 ? categories : undefined;
+}
+
 /**
  * Transform a typical RSS item into a BaseDocument
  *
  * @param item RSS <item> element
  * @param sourceId ID of the source
  * @returns Elasticsearch BaseDocument
  */
-export function transformRssItem(
-  item: any,
-  sourceId: string
-) {
+export function transformRssItem(item: any, sourceId: string) {
   const title = stripHtmlTags(item.title?.[0]);
   const description = stripHtmlTags(item.description?.[0]);
   const searchText = stripHtmlTags(item['content:encoded']?.[0]);
@@ -91,7 +133,7 @@ export function transformRssItem(
     title,
     description,
     searchText,
-    keywords: item.category?.length ? item.category : undefined,
+    keywords: getItemCategories(item),
     image: {
       url: thumbnailUrl,
       thumbnailUrl: thumbnailUrl,

diff --git a/vercel.json b/vercel.json
@@ -1,8 +1,8 @@
 {
   "crons": [
     {
-      "path": "/api/import/rss?secret=dfJtqJDG9VwN69edUU283qnD",
+      "path": "/api/import/rss",
       "schedule": "0 * * * *"
     }
   ]
-}
+}