Skip to content

Commit

Permalink
Merge pull request #36 from derekphilipau/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
derekphilipau authored Dec 29, 2023
2 parents 0f86ed4 + 19ebdac commit 55eef36
Show file tree
Hide file tree
Showing 11 changed files with 120 additions and 80 deletions.
8 changes: 6 additions & 2 deletions .env.local.sample
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ ELASTICSEARCH_PORT=9200
ELASTICSEARCH_CA_FILE=./secrets/es01.crt
ELASTICSEARCH_API_KEY=MVE2aWxZUIJBWkNOUzYwU1ZKbUg6dEllY4JjQkVTZ3lFWlU3RRdLUm5mQQ==
ELASTICSEARCH_BULK_LIMIT=2000
API_SECRET=dfJtqJDG9VwN69edUU283qnD
NEXT_PUBLIC_IMAGE_DOMAIN=rx3rxq8hyni2c.cloudfront.net
CRON_SECRET=supersecretrandomstringover16characters
AWS_ACCESS_KEY_ID=myaccesskey
AWS_SECRET_ACCESS_KEY=mysecretkey
AWS_REGION=myawsregion
AWS_BUCKET_NAME=mybucketname
NEXT_PUBLIC_IMAGE_DOMAIN=mydomain.cloudfront.net
PROCESS_IMAGES=true
FORMSPREE_FORM_ID=rwejcdbw
OPENAI_API_KEY=sk-231rZaTl2w4MuRPOrsT1T9BlckFJes7O2D1RIOqEkvV2SEAZ
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/playwright-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ jobs:
BASE_URL: ${{ github.event.deployment_status.environment_url }}
AWS_REGION: ${{ secrets.AWS_REGION }}
AWS_BUCKET_NAME: ${{ secrets.AWS_BUCKET_NAME }}
API_SECRET: ${{ secrets.API_SECRET }}
ELASTICSEARCH_CLOUD_ID: ${{ secrets.ELASTICSEARCH_CLOUD_ID }}
ELASTICSEARCH_CLOUD_USERNAME: ${{ secrets.ELASTICSEARCH_CLOUD_USERNAME }}
ELASTICSEARCH_CLOUD_PASSWORD: ${{ secrets.ELASTICSEARCH_CLOUD_PASSWORD }}
Expand Down
51 changes: 46 additions & 5 deletions __tests__/lib/import/ingesters/rss/util.test.ts
Original file line number Diff line number Diff line change
@@ -1,28 +1,50 @@
import fs from 'fs';
import path from 'path';
import { parseStringPromise } from 'xml2js';

import {
getRssItemId,
getRssItemImageUrl,
parseDate,
parseXml,
transformRssItem,
} from '@/lib/import/ingesters/rss/util';
import { stripHtmlTags } from '@/lib/various';

describe('rss import function', () => {
describe('transformRssItem', () => {
// Read the content of the RSS feed from the static file
const testContent = fs.readFileSync(
path.join(__dirname, './artnews.rss.xml'),
'utf8'
);

let items: any[] = [];
let nytItemXml: any = {};

// Use beforeAll to process async operations before tests
beforeAll(async () => {
const jsonObj = await parseStringPromise(testContent);
const jsonObj = await parseXml(testContent);
items = jsonObj.rss.channel[0].item;

const nytItemRss = `
<item>
<title>How 1993 — and Two Watershed Shows — Help Make Sense of 2023</title>
<link>https://www.nytimes.com/2023/12/27/arts/design/1993-whitney-biennial-theater-of-refusal-art.html</link>
<guid isPermaLink="true">https://www.nytimes.com/2023/12/27/arts/design/1993-whitney-biennial-theater-of-refusal-art.html</guid>
<atom:link href="https://www.nytimes.com/2023/12/27/arts/design/1993-whitney-biennial-theater-of-refusal-art.html" rel="standout"/>
<description>A blue-chip gallery asks, does the infamous Whitney Biennial or “The Theater of Refusal” measure up 30 years later, when artists of color have moved to the mainstream?</description>
<dc:creator>Aruna D’Souza</dc:creator>
<pubDate>Wed, 27 Dec 2023 16:51:11 +0000</pubDate>
<category domain="http://www.nytimes.com/namespaces/keywords/des">Art</category>
<category domain="http://www.nytimes.com/namespaces/keywords/des">Photography</category>
<category domain="http://www.nytimes.com/namespaces/keywords/des">Race and Ethnicity</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">Hauser & Wirth</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_per">Fowle, Kate</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_geo">Los Angeles (Calif)</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_per">Gaines, Charles</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">Whitney Museum of American Art</category>
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">University of California</category>
</item>`;
nytItemXml = (await parseXml(nytItemRss))?.item;
});

it('should return image for content embedded', () => {
Expand All @@ -34,7 +56,6 @@ describe('rss import function', () => {

it("should return the rss item's id", () => {
const itemId = getRssItemId(items[0]);
console.log(itemId);
expect(itemId).toBe('https://www.artnews.com/?p=1234682122');
});

Expand All @@ -51,6 +72,26 @@ describe('rss import function', () => {
);
expect(transformedItem.sourceId).toBe(sourceId);
expect(transformedItem.title).toBe(strippedTitle);
expect(transformedItem.keywords).toBe('Art in America, Reviews');
expect(transformedItem.keywords).toEqual(['Art in America', 'Reviews']);
});

it('should return the transformed rss for nytimes', () => {
const sourceId = 'nytimes';
const transformedItem = transformRssItem(nytItemXml, sourceId);
expect(transformedItem.sourceId).toBe(sourceId);
expect(transformedItem.title).toBe(
'How 1993 — and Two Watershed Shows — Help Make Sense of 2023'
);
expect(transformedItem.keywords).toEqual([
'Art',
'Photography',
'Race and Ethnicity',
'Hauser & Wirth',
'Fowle, Kate',
'Los Angeles (Calif)',
'Gaines, Charles',
'Whitney Museum of American Art',
'University of California',
]);
});
});
31 changes: 8 additions & 23 deletions app/api/import/rss/route.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { NextResponse } from 'next/server';
import { NextResponse, type NextRequest } from 'next/server';

import updateRssFeeds from '@/lib/import/updateRssFeed';

Expand Down Expand Up @@ -27,15 +27,6 @@ import updateRssFeeds from '@/lib/import/updateRssFeed';
* type: boolean
* message:
* type: string
* 400:
* description: API_SECRET environment variable not set or other bad request.
* content:
* application/json:
* schema:
* type: object
* properties:
* error:
* type: string
* 401:
* description: Unauthorized request due to invalid secret.
* content:
Expand All @@ -55,19 +46,13 @@ import updateRssFeeds from '@/lib/import/updateRssFeed';
* error:
* type: object
*/
export async function GET(request: Request) {
const { searchParams } = new URL(request.url);
const secret = searchParams.get('secret');
const realSecret = process.env.API_SECRET;

if (!realSecret)
return NextResponse.json(
{ error: 'API_SECRET environment variable not set' },
{ status: 400 }
);

if (secret !== realSecret)
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 });
export async function GET(request: NextRequest) {
const authHeader = request.headers.get('authorization');
if (authHeader !== `Bearer ${process.env.CRON_SECRET}`) {
return new Response('Unauthorized', {
status: 401,
});
}

try {
await updateRssFeeds();
Expand Down
33 changes: 0 additions & 33 deletions components/ui/popover-local.tsx

This file was deleted.

2 changes: 1 addition & 1 deletion config/site.ts
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ export const siteConfig: SiteConfig = {
{
ingester: 'rssIngester',
sourceId: 'artforum',
url: 'https://www.artforum.com/rss.xml',
url: 'https://www.artforum.com/feed/',
},
/*
LACMA is using their RSS feed for all events, filling up the timeline.
Expand Down
6 changes: 3 additions & 3 deletions docs/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
API endpoints for search, document retrieval, and RSS feed updates. Next.js API Routes with Route Handlers located in `/app/api` directory.

[Search](#search)

- [GET `/api/search`](#apisearch): Search documents
- [GET `/api/search/document`](#apisearchdocument): Get a document
- [GET `/api/search/options`](#apisearchoptions): Get agg options
Expand All @@ -11,6 +12,7 @@ API endpoints for search, document retrieval, and RSS feed updates. Next.js API
- [GET `/api/search/terms`](#apisearchterms): Get terms

[Sync](#sync)

- [GET `/api/import/rss`](#apiimportrss): Import/upsert RSS feeds

## Search
Expand Down Expand Up @@ -347,7 +349,7 @@ API endpoints for search, document retrieval, and RSS feed updates. Next.js API

### `/api/import/rss`

**GET**: Updates RSS feeds. TODO: Insecure due to "secret" key.
**GET**: Updates RSS feeds. Requires the `CRON_SECRET` environment variable, sent by the caller as an `Authorization: Bearer <CRON_SECRET>` header.

- **Summary**: Updates RSS feeds
- **Description**: Endpoint to update RSS feeds. Requires secret for authentication.
Expand All @@ -371,5 +373,3 @@ API endpoints for search, document retrieval, and RSS feed updates. Next.js API
- **type**: boolean
- **message**:
- **type**: string

**400**: API_SECRET environment variable not set or other bad request.
4 changes: 2 additions & 2 deletions docs/SCHEMA.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Data has been collected from a number of sources, and more sources will be added
- [Colossal](https://www.thisiscolossal.com): [Feed](https://www.thisiscolossal.com/feed/)
- [Hi-Fructose](https://hifructose.com): [Feed](https://hifructose.com/feed/)
- [Juxtapoz](https://www.juxtapoz.com): [Feed](https://www.juxtapoz.com/news/?format=feed&type=rss)
- [Artforum](https://www.artforum.com): [Feed](https://www.artforum.com/rss.xml)
- [Artforum](https://www.artforum.com): [Feed](https://www.artforum.com/feed/)
- [LACMA](https://www.lacma.org): [Feed](https://www.lacma.org/rss.xml)
- [Aesthetica](https://aestheticamagazine.com): [Feed](https://aestheticamagazine.com/feed/)
- [New Yorker Daily Cartoon](https://www.newyorker.com): [Feed](https://www.newyorker.com/feed/cartoons/daily-cartoon)
Expand Down Expand Up @@ -140,7 +140,7 @@ The base document defines common fields for all indices, these are the fields us
- `boostedKeywords` - An array of keywords that should be boosted in search results
- `primaryConstituent` - The primary constituent of the document, e.g. the artist of a painting.
- `image` - Image. The main image of the document
- `date` - For RSS feeds, the date of the entry. For events, the start date of the event. Not used for artworks. TODO: Refactor to `startDate` or something clearer.
- `date` - For RSS feeds, the date of the entry. For events, the start date of the event. Not used for artworks. TODO: Refactor to `startDate` or something clearer.
- `formattedDate` - A string representing the date, no strict format.
- `startYear` - An integer representing the start date year. Used for year range filtering.
- `endYear` - An integer representing the end date year. Used for year range filtering.
Expand Down
8 changes: 5 additions & 3 deletions docs/SETUP.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ For local development, add a local `.env.local` file in the root directory. If `

On [Formspree](https://formspree.io/) you should set up a basic contact form and enter the `FORMSPREE_FORM_ID` env variable.

OpenAI & Google vars only necessary if using OpenAI to extract exhibition & event data from URL's. In that case, a Google sheet is used to store the extracted content.
The OpenAI & Google variables are only necessary if you use OpenAI to extract exhibition & event data from URLs. In that case, a Google Sheet is used to store the extracted content.

For cloud deployments (for example on Vercel), add the same variables to the Environment Variables of your deployment.

Expand All @@ -67,7 +67,7 @@ ELASTICSEARCH_PORT=9200
ELASTICSEARCH_CA_FILE=./secrets/es01.crt
ELASTICSEARCH_API_KEY=MVE2aWxZUIJBWkNOUzYwU1ZKbUg6dEllY4JjQkVTZ3lFWlU3RRdLUm5mQQ==
ELASTICSEARCH_BULK_LIMIT=2000
API_SECRET=dfJtqJDG9VwN69edUU283qnD
CRON_SECRET=supersecretrandomstringover16characters
NEXT_PUBLIC_IMAGE_DOMAIN=rx3rxq8hyni2c.cloudfront.net
PROCESS_IMAGES=true
FORMSPREE_FORM_ID=rwejcdbw
Expand Down Expand Up @@ -122,9 +122,11 @@ For automatic syncing of RSS feeds, add a cron job to your hosting service. For
{
"crons": [
{
"path": "/api/import/rss?secret=dfJtqJDG9VwN69edUU283qnD",
"path": "/api/import/rss",
"schedule": "0 * * * *"
}
]
}
```

Please see [Securing cron jobs](https://vercel.com/docs/cron-jobs/manage-cron-jobs#securing-cron-jobs) for more information on securing cron jobs using Vercel's CRON_SECRET environment variable sent as an Authorization header.
52 changes: 47 additions & 5 deletions lib/import/ingesters/rss/util.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,34 @@
import { type } from 'os';
import { format, getYear, isValid, parse } from 'date-fns';
import { parseStringPromise } from 'xml2js';

import type { BaseDocument, DocumentConstituent } from '@/types/document';
import { stripHtmlTags } from '@/lib/various';

/**
 * Escape bare ampersands that would otherwise make an XML document invalid.
 *
 * Some RSS feeds contain unescaped '&' characters in text nodes, e.g.:
 * <category domain="http://www.nytimes.com/namespaces/keywords/nyt_org">Hauser & Wirth</category>
 *
 * An ampersand is left untouched when it already starts one of the five
 * predefined XML entities (&amp; &lt; &gt; &quot; &apos;) or a numeric
 * character reference, decimal (&#8217;) or hexadecimal (&#x2019;).
 * CDATA sections are copied through unchanged, because their content is
 * literal text and an '&' inside them is already valid.
 *
 * @param xmlStr XML string
 * @returns XML string with bare ampersands replaced by '&amp;'
 */
export function escapeXmlInvalidChars(xmlStr: string): string {
  // First alternative consumes a whole CDATA section so that its content is
  // never inspected; second alternative matches an '&' that does not begin a
  // predefined entity or a numeric character reference.
  const cdataOrBareAmpersand =
    /<!\[CDATA\[[\s\S]*?\]\]>|&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);)/g;
  return xmlStr.replace(cdataOrBareAmpersand, (match) =>
    match === '&' ? '&amp;' : match
  );
}

/**
 * Convert an XML string into the JavaScript object produced by xml2js.
 * The input is first passed through escapeXmlInvalidChars so that bare
 * ampersands do not cause the parser to reject the document.
 *
 * @param xmlString XML string
 * @returns Parsed object as returned by xml2js `parseStringPromise`
 */
export async function parseXml(xmlString: string): Promise<any> {
  const sanitizedXml = escapeXmlInvalidChars(xmlString);
  return await parseStringPromise(sanitizedXml);
}

/**
* Get the image url from the rss item, either from the description or the content
* @param item RSS <item> element
Expand Down Expand Up @@ -64,17 +90,33 @@ export function parseDate(item: any): Date {
throw new Error('RSS Item Date could not be parsed');
}

/**
 * Extract the category names of an RSS item as plain strings.
 *
 * Two shapes of category entry are handled:
 * 1. String values:
 *    <category><![CDATA[Art in America]]></category>
 * 2. Object values whose text is stored under the `_` key (elements that
 *    carry attributes):
 *    <category domain="http://www.nytimes.com/namespaces/keywords/des">Art</category>
 *
 * Entries that yield no non-empty text (for example an element with
 * attributes but no content, or a null entry) are skipped, so the result
 * never contains undefined or empty values.
 *
 * @param item RSS <item> element
 * @returns string[] of categories, or undefined when the item has no
 *   `category` array or none of its entries contains text
 */
export function getItemCategories(item: any): string[] | undefined {
  const rawCategories = item?.category;
  // Anything other than an array cannot be iterated as a category list.
  if (!Array.isArray(rawCategories)) return undefined;

  const categories: string[] = [];
  for (const category of rawCategories) {
    // Plain string entry, or the text content of an attribute-bearing entry.
    const text = typeof category === 'string' ? category : category?._;
    if (typeof text === 'string' && text.length > 0) categories.push(text);
  }
  return categories.length > 0 ? categories : undefined;
}

/**
* Transform a typical RSS item into a BaseDocument
*
* @param item RSS <item> element
* @param sourceId ID of the source
* @returns Elasticsearch BaseDocument
*/
export function transformRssItem(
item: any,
sourceId: string
) {
export function transformRssItem(item: any, sourceId: string) {
const title = stripHtmlTags(item.title?.[0]);
const description = stripHtmlTags(item.description?.[0]);
const searchText = stripHtmlTags(item['content:encoded']?.[0]);
Expand All @@ -91,7 +133,7 @@ export function transformRssItem(
title,
description,
searchText,
keywords: item.category?.length ? item.category : undefined,
keywords: getItemCategories(item),
image: {
url: thumbnailUrl,
thumbnailUrl: thumbnailUrl,
Expand Down
4 changes: 2 additions & 2 deletions vercel.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"crons": [
{
"path": "/api/import/rss?secret=dfJtqJDG9VwN69edUU283qnD",
"path": "/api/import/rss",
"schedule": "0 * * * *"
}
]
}
}

0 comments on commit 55eef36

Please sign in to comment.