Cleanup

adobecom · Mar 4, 2024 · 77b142a · 77b142a
1 parent 5ed016f
commit 77b142a
Show file tree

Hide file tree

Showing 13 changed files with 366 additions and 205 deletions.
diff --git a/bulk-update/bulk-update.js b/bulk-update/bulk-update.js
@@ -1,6 +1,7 @@
 import fs from 'fs';
 import { fetch } from '@adobe/fetch';
-import { loadDocument, checkLinks } from './document-manager/document-manager.js';
+import { loadDocument } from './document-manager/document-manager.js';
+import { validateMigration } from './validation/validation.js';
 
 const delay = (milliseconds) => new Promise((resolve) => { setTimeout(resolve, milliseconds); });
 
@@ -37,7 +38,7 @@ export async function loadQueryIndex(url, fetchFunction = fetch, fetchWaitMs = 5
     const nextUrl = new URL(url);
     nextUrl.searchParams.set('limit', limit);
     nextUrl.searchParams.set('offset', offset + limit);
-    entries.push(...await loadQueryIndex(nextUrl.toString(), fetchFunction));
+    entries.push(...await loadQueryIndex(nextUrl.toString(), fetchFunction, fetchWaitMs));
   }
 
   return entries;
@@ -52,13 +53,13 @@ export async function loadQueryIndex(url, fetchFunction = fetch, fetchWaitMs = 5
  * @returns {Promise<string[]>} - The loaded data as an array of strings.
  * @throws {Error} - If the list format or entry is unsupported.
  */
-export async function loadListData(source, fetchFunction = fetch) {
+export async function loadListData(source, fetchFunction = fetch, fetchWaitMs = 500) {
   if (!source) return [];
   if (Array.isArray(source) || source.includes(',')) {
     const entries = Array.isArray(source) ? source : source.split(',');
     const loadedEntries = [];
     for (const entry of entries) {
-      const loadedData = await loadListData(entry.trim(), fetchFunction);
+      const loadedData = await loadListData(entry.trim(), fetchFunction, fetchWaitMs);
       if (loadedData) loadedEntries.push(...loadedData);
     }
     return loadedEntries;
@@ -73,38 +74,18 @@ export async function loadListData(source, fetchFunction = fetch) {
   switch (extension) {
     case 'json':
       if (source.startsWith('http')) {
-        return loadQueryIndex(source, fetchFunction);
+        return loadQueryIndex(source, fetchFunction, fetchWaitMs);
       }
-      return loadListData(JSON.parse(fs.readFileSync(source, 'utf8').trim()), fetchFunction);
+      return loadListData(JSON.parse(fs.readFileSync(source, 'utf8').trim()), fetchFunction, fetchWaitMs);
     case 'txt':
-      return loadListData(fs.readFileSync(source, 'utf8').trim().split('\n'), fetchFunction);
+      return loadListData(fs.readFileSync(source, 'utf8').trim().split('\n'), fetchFunction, fetchWaitMs);
     case 'html':
       return [source];
     default:
       throw new Error(`Unsupported list format or entry: ${source}`);
   }
 }
 
-/**
- * Validates the migration by checking the links in the entry against the provided configuration.
- *
- * @param {Object} entry - The entry to validate.
- * @param {Object} config - The configuration object.
- * @returns {Promise<void>} - A promise that resolves once the validation is complete.
- */
-async function validateMigration({ entry }, config) {
-  const links = await checkLinks(entry, config);
-  if (links) {
-    console.log(`Links Match: ${links.match}, ${links.unique.length} unique links found.`);
-    if (links.unique.length) {
-      config?.reporter.log('validation', 'error', 'Unique links found', { entry, count: links.unique.length });
-      console.table(links.unique);
-    }
-  } else {
-    console.log('Could not validate links');
-  }
-}
-
 /**
  * Executes a bulk update operation using the provided migration function
  * Loads data from various sources and executes bulk update operations from the migration function.
@@ -121,10 +102,8 @@ export default async function main(config, migrate, reporter = null) {
     for (const [i, entry] of config.list.entries()) {
       console.log(`Processing entry ${i + 1} of ${config.list.length} ${entry}`);
       const document = await loadDocument(entry, config);
-      const success = await migrate(document);
-      if (success) {
-        await validateMigration(document, config);
-      }
+      await migrate(document);
+      await validateMigration(document, config);
     }
   } catch (e) {
     console.error('Bulk Update Error:', e);

diff --git a/bulk-update/document-manager/document-manager.js b/bulk-update/document-manager/document-manager.js
@@ -3,7 +3,6 @@ import fs from 'fs';
 import { fetch, timeoutSignal, AbortError } from '@adobe/fetch';
 import { mdast2docx } from '@adobe/helix-md2docx';
 import parseMarkdown from '@adobe/helix-html-pipeline/src/steps/parse-markdown.js';
-import { compare } from '../../link-check/linkCompare.js';
 
 const delay = (milliseconds) => new Promise((resolve) => { setTimeout(resolve, milliseconds); });
 const { pathname } = new URL('.', import.meta.url);
@@ -22,22 +21,6 @@ export function entryToPath(entry) {
   return path;
 }
 
-/**
- * Checks links against the original document.
- *
- * @param {string} entry - The entry to check the links for.
- * @param {object} config - The configuration object.
- * @returns {Promise<object>}
- */
-export function checkLinks(entry, config) {
-  const output = `${config.outputDir}${entryToPath(entry)}.docx`;
-  const mdURL = `${config.siteUrl}${entry}.md`;
-
-  if (!fs.existsSync(output)) return false;
-
-  return compare(mdURL, output);
-}
-
 /**
  * Fetches a markdown file from a given URL.
  *

diff --git a/bulk-update/migration-tools/select.js b/bulk-update/migration-tools/select.js
@@ -9,8 +9,7 @@ import { select, selectAll } from 'unist-util-select';
  * @param {string} str - The input block string.
  * @returns {Object} - An object containing the block name and options.
  */
-export const getBlockInfo = (str) => {
-  if (!str) return null;
+export const getBlockInfo = (str = '') => {
   const blockInfo = {};
   const regex = /([\w\s-]+)\s*(?:\(([^)]*)\))?/;
   const match = regex.exec(str.toLowerCase());

diff --git a/bulk-update/validation/images.js b/bulk-update/validation/images.js
@@ -0,0 +1,23 @@
+/* eslint-disable import/prefer-default-export */
+/**
+ * Finds images in a markdown string whose alt text is empty.
+ *
+ * Only image targets starting with "http" are reported.
+ *
+ * @param {string} markdown - The markdown string to check.
+ * @returns {string[]} - URLs of the images that have no alt text.
+ */
+export function checkAltText(markdown) {
+  const imagePattern = /!\[(.*?)\]\((.*?)\)/g;
+  const urlsWithoutAlt = [];
+
+  // Groups 1 and 2 hold the alt text and the target of each `![alt](target)`.
+  for (const [, altText, url] of markdown.matchAll(imagePattern)) {
+    const isMissing = !altText && url.startsWith('http');
+    if (isMissing) {
+      urlsWithoutAlt.push(url);
+    }
+  }
+
+  return urlsWithoutAlt;
+}
diff --git a/bulk-update/validation/links.js b/bulk-update/validation/links.js
@@ -0,0 +1,62 @@
+/**
+ * Tells whether two links resolve to the same host and pathname.
+ *
+ * @param {string} link1 - The first link to compare.
+ * @param {string} link2 - The second link to compare.
+ * @param {string} [site] - Base URL passed to the URL constructor for both links.
+ * @returns {boolean} - True when host and pathname are equal, otherwise false.
+ */
+export function compareLink(link1, link2, site) {
+  const [first, second] = [link1, link2].map((link) => new URL(link.trim(), site));
+
+  return first.host === second.host && first.pathname === second.pathname;
+}
+/**
+ * Collects the absolute link targets found in markdown content.
+ *
+ * Targets that do not start with "http" are skipped.
+ *
+ * @param {string} content - The markdown content.
+ * @returns {string[]} - The targets of `[text](target)` links, in document order.
+ */
+export function extractLinks(content) {
+  const linkPattern = /\[.*?\]\((.*?)\)/g;
+
+  // String() mirrors the coercion RegExp#exec applies to its argument.
+  const targets = Array.from(
+    String(content).matchAll(linkPattern),
+    ([, target]) => target,
+  );
+
+  return targets.filter((target) => target.startsWith('http'));
+}
+
+/**
+ * Compares two arrays of links position by position.
+ *
+ * @param {Array} links1 - The first array of links.
+ * @param {Array} links2 - The second array of links.
+ * @param {string} [site] - Base URL forwarded to compareLink for each pair.
+ * @returns {object} - `match` flag, per-position `links`, and the `unique` (unmatched) entries.
+ */
+export function compareLinks(links1, links2, site) {
+  // Walk the longer list so links present in only one array are reported too.
+  const length = Math.max(links1.length, links2.length);
+  const links = Array.from({ length }, (_, index) => {
+    const link1 = links1[index];
+    const link2 = links2[index];
+    const match = (link1 && link2) ? compareLink(link1, link2, site) : false;
+
+    return { link: index, link1, link2, match };
+  });
+  const unique = links.filter((link) => !link.match);
+
+  return { match: unique.length === 0, unique, links };
+}
+
+/** Compares the absolute links of two markdown strings; `site` is handed to compareLinks. */
+export function compareMarkdown(content1, content2, site = 'https://business.adobe.com/') {
+  const [links1, links2] = [content1, content2].map((content) => extractLinks(content));
+
+  return compareLinks(links1, links2, site);
+}
diff --git a/bulk-update/validation/validation.js b/bulk-update/validation/validation.js
@@ -0,0 +1,46 @@
+import fs from 'fs';
+import { docx2md } from '@adobe/helix-docx2md';
+import { entryToPath } from '../document-manager/document-manager.js';
+import { compareMarkdown } from './links.js';
+import { checkAltText } from './images.js';
+
+/** Compares the links of two markdown strings, logs the outcome, and reports unmatched links. */
+export function checkLinks(md, markdown, reporter, entry) {
+  const links = compareMarkdown(md, markdown);
+  if (!links) {
+    console.log('Could not validate links');
+    return;
+  }
+  const { match, unique } = links;
+  console.log(`Links Match: ${match}, ${unique.length} unique links found.`);
+  if (unique.length === 0) return;
+  reporter?.log('validation', 'error', 'Unique links found', { entry, count: unique.length });
+  console.table(unique);
+}
+
+/** Logs images in `md` that lack alt text and reports them; `markdown` is currently unused. */
+export function checkImages(md, markdown, reporter, entry) {
+  const missingAltText = checkAltText(md);
+  console.log(`Images Missing Alt Text: ${missingAltText.length}`);
+  if (missingAltText.length === 0) return;
+  reporter?.log('validation', 'error', 'Missing alt text', { entry, count: missingAltText.length });
+  console.log(missingAltText);
+}
+
+/** Converts the output .docx for the document's entry to markdown and runs link/image checks. */
+export async function validateMigration(document, config) {
+  const { markdown: sourceMarkdown, entry } = document;
+  const { reporter, outputDir } = config;
+  const docxPath = `${outputDir}${entryToPath(entry)}.docx`;
+  // No output .docx exists at this path, so there is nothing to validate.
+  if (!fs.existsSync(docxPath)) return;
+  try {
+    const buffer = await fs.promises.readFile(docxPath);
+    const migratedMarkdown = await docx2md(buffer, { listener: null });
+    checkLinks(migratedMarkdown, sourceMarkdown, reporter, entry);
+    checkImages(migratedMarkdown, sourceMarkdown, reporter, entry);
+  } catch (error) {
+    console.error('Error validating migration:', error);
+    reporter?.log('validation', 'error', 'Error validating migration', { entry, error: error.message });
+  }
+}
diff --git a/link-check/linkCompare.js b/link-check/linkCompare.js
@@ -10,7 +10,6 @@ import { docx2md } from '@adobe/helix-docx2md';
  * @returns {boolean} - Returns true if the links have the same host and pathname, otherwise false.
  */
 export function compareLink(link1, link2) {
-  if (!link1 || !link2) return false;
   const url1 = new URL(link1.trim(), 'https://business.adobe.com/');
   const url2 = new URL(link2.trim(), 'https://business.adobe.com/');
 
@@ -49,54 +48,47 @@ function getFileType(source) {
 
   return source.split('.').pop() || null;
 }
+
 /**
- * Extracts links from content based on a given regex pattern.
+ * Extracts links from markdown content.
  *
- * @param {string} content - The content to extract links from.
- * @param {RegExp} regex - The regex pattern to match links.
+ * @param {string} content - The markdown content.
  * @returns {string[]} - An array of links extracted from the content.
  */
-function findLinks(content, regex, i) {
+function extractLinksFromMarkdown(content) {
+  const regex = /\[.*?\]\((.*?)\)/g;
   const links = [];
   let match = regex.exec(content);
   while (match !== null) {
-    const link = match[i];
-    if (link.startsWith('http')) {
-      links.push(link);
-    }
+    links.push(match[1]);
     match = regex.exec(content);
   }
   return links;
 }
 
-/**
- * Extracts links from markdown content.
- *
- * @param {string} content - The markdown content.
- * @returns {string[]} - An array of links extracted from the content.
- */
-export function extractLinksFromMarkdown(content) {
-  const regex = /\[.*?\]\((.*?)\)/g;
-  return findLinks(content, regex, 1);
-}
-
 /**
  * Extracts links from HTML content.
  *
  * @param {string} content - The HTML content.
  * @returns {string[]} - An array of links extracted from the content.
  */
-export function extractLinksFromHtml(content) {
+function extractLinksFromHtml(content) {
   const regex = /<a\s+(?:[^>]*?\s+)?href=(["'])(.*?)\1/g;
-  return findLinks(content, regex, 2);
+  const links = [];
+  let match = regex.exec(content);
+  while (match !== null) {
+    links.push(match[2]);
+    match = regex.exec(content);
+  }
+  return links;
 }
 
 /**
  * Extracts links from a source based on its file type.
  *
  * @param {string} source - The source URL or file path.
- * @param {Function} [fetchFn=fetch] - The function used to fetch the content from the source.
- * @returns {Promise<string[]>} - An array of links extracted from the source.
+ * @param {string} content - The content of the source.
+ * @returns {string[]} - An array of links extracted from the source.
  * @throws {Error} - Throws an error if the file type is unsupported.
  */
 export async function extractLinks(source, fetchFn = fetch) {
@@ -128,14 +120,12 @@ export async function extractLinks(source, fetchFn = fetch) {
  * @param {Array} links2 - The second array of links.
  * @returns {Promise<object>} - Match status and unique links.
  */
-export function compareLinks(links1, links2) {
+export async function compareLinks(links1, links2) {
   const result = { match: false, unique: [] };
 
   result.links = links1.map((link1, index) => {
     const link2 = links2[index];
-    const match = compareLink(link1, link2);
-
-    return { link: index, link1, link2, match };
+    return { index, link1, link2, match: compareLink(link1, link2) };
   });
 
   result.unique = result.links.filter((link) => !link.match);

diff --git a/test/bulk-update/bulk-update.test.js b/test/bulk-update/bulk-update.test.js
@@ -36,7 +36,7 @@ describe('BulkUpdater', () => {
           ],
         }),
       });
-      const data = await loadListData('https://main--bacom--adobecom.hlx.test/query-index.json', stubFetch);
+      const data = await loadListData('https://main--bacom--adobecom.hlx.test/query-index.json', stubFetch, 0);
 
       expect(data).to.be.an('array');
       expect(data.length).to.equal(1);
@@ -55,7 +55,7 @@ describe('BulkUpdater', () => {
           ],
         }),
       });
-      const data = await loadListData(`${pathname}mock/query-indexes.json`, stubFetch);
+      const data = await loadListData(`${pathname}mock/query-indexes.json`, stubFetch, 0);
 
       expect(data).to.be.an('array');
       expect(data).to.deep.equal(['/test/path1', '/test/path1']);