Skip to content

Commit

Permalink
Change ingestion input to JSON lines format (#639)
Browse files Browse the repository at this point in the history
Signed-off-by: Tyler Ohlsen <[email protected]>
(cherry picked from commit a484199)
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
github-actions[bot] committed Feb 24, 2025
1 parent ccfbb9e commit 0374d0b
Show file tree
Hide file tree
Showing 16 changed files with 285 additions and 32 deletions.
1 change: 1 addition & 0 deletions common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ export const ML_RESPONSE_PROCESSOR_EXAMPLE_DOCS_LINK =
'https://opensearch.org/docs/latest/search-plugins/search-pipelines/ml-inference-search-response/#example-externally-hosted-text-embedding-model';
export const UPDATE_MODEL_DOCS_LINK =
'https://opensearch.org/docs/latest/ml-commons-plugin/api/model-apis/update-model/';
// Link to the JSON Lines website (https://jsonlines.org/).
export const JSONLINES_LINK = 'https://jsonlines.org/';

// Large Language Models Documentation Links
export const BEDROCK_CLAUDE_3_SONNET_DOCS_LINK =
Expand Down
1 change: 1 addition & 0 deletions common/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ export type ConfigFieldType =
| 'json'
| 'jsonArray'
| 'jsonString'
| 'jsonLines'
| 'select'
| 'model'
| 'map'
Expand Down
4 changes: 4 additions & 0 deletions common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ export function customStringify(jsonObj: {} | []): string {
return JSON.stringify(jsonObj, undefined, 2);
}

/**
 * Serialize a JSON-compatible value into its compact, single-line string
 * form (no indentation and no line breaks between members).
 *
 * @param jsonObj the value to serialize
 * @returns the compact JSON string
 */
export function customStringifySingleLine(jsonObj: {}): string {
  // Omitting the replacer/space arguments yields the same compact output
  // as passing `undefined` and `0` explicitly.
  const compact = JSON.stringify(jsonObj);
  return compact;
}

export function isVectorSearchUseCase(workflow: Workflow | undefined): boolean {
return (
workflow?.ui_metadata?.type !== undefined &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ export function SourceData(props: SourceDataProps) {
// empty/populated docs state
let docs = [];
try {
docs = JSON.parse(getIn(values, 'ingest.docs', []));
const lines = getIn(values, 'ingest.docs', '').split('\n') as string[];
lines.forEach((line) => docs.push(JSON.parse(line)));
} catch {}
const docsPopulated = docs.length > 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,17 @@ import {
EuiSmallButtonEmpty,
EuiButtonGroup,
EuiCompressedComboBox,
EuiLink,
} from '@elastic/eui';
import { JsonField } from '../input_fields';
import { JsonLinesField } from '../input_fields';
import {
customStringify,
customStringifySingleLine,
FETCH_ALL_QUERY_LARGE,
IConfigField,
IndexMappings,
IngestDocsFormValues,
isVectorSearchUseCase,
JSONLINES_LINK,
MAX_BYTES_FORMATTED,
MAX_DOCS_TO_IMPORT,
SearchHit,
Expand Down Expand Up @@ -72,11 +74,11 @@ export function SourceDataModal(props: SourceDataProps) {

// sub-form values/schema
const docsFormValues = {
docs: getInitialValue('jsonArray'),
docs: getInitialValue('jsonLines'),
} as IngestDocsFormValues;
const docsFormSchema = yup.object({
docs: getFieldSchema({
type: 'jsonArray',
type: 'jsonLines',
} as IConfigField),
}) as yup.Schema;

Expand Down Expand Up @@ -177,8 +179,14 @@ export function SourceDataModal(props: SourceDataProps) {
.then((resp) => {
const docObjs = resp?.hits?.hits
?.slice(0, MAX_DOCS_TO_IMPORT)
?.map((hit: SearchHit) => hit?._source);
formikProps.setFieldValue('docs', customStringify(docObjs));
?.map((hit: SearchHit) => hit?._source) as {}[];
let jsonLinesStr = '';
try {
docObjs.forEach((docObj) => {
jsonLinesStr += customStringifySingleLine(docObj) + '\n';
});
} catch {}
formikProps.setFieldValue('docs', jsonLinesStr);
});
}
}, [selectedIndex]);
Expand Down Expand Up @@ -234,7 +242,7 @@ export function SourceDataModal(props: SourceDataProps) {
{props.selectedOption === SOURCE_OPTIONS.UPLOAD && (
<>
<EuiCompressedFilePicker
accept="application/json"
accept=".jsonl"
multiple={false}
initialPromptText="Upload file"
onChange={(files) => {
Expand All @@ -247,6 +255,7 @@ export function SourceDataModal(props: SourceDataProps) {
'docs',
e.target.result as string
);
formikProps.setFieldTouched('docs');
}
};
fileReader.readAsText(files[0]);
Expand Down Expand Up @@ -286,12 +295,20 @@ export function SourceDataModal(props: SourceDataProps) {
<EuiSpacer size="xs" />
</>
)}
<JsonField
<JsonLinesField
label="Documents to be imported"
fieldPath={'docs'}
helpText="Documents must be in a JSON array format."
helpText={
<EuiText size="s">
Documents must be in JSON lines format.{' '}
<EuiLink href={JSONLINES_LINK} target="_blank">
Learn more
</EuiLink>
</EuiText>
}
editorHeight="40vh"
readOnly={false}
validate={true}
/>
</>
</EuiModalBody>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

export { TextField } from './text_field';
export { JsonField } from './json_field';
export { JsonLinesField } from './json_lines_field';
export { ModelField } from './model_field';
export { MapField } from './map_field';
export { MapArrayField } from './map_array_field';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ interface JsonFieldProps {
* in some custom JSON
*/
export function JsonField(props: JsonFieldProps) {
const validate = props.validate !== undefined ? props.validate : true;
const validate = props.validate ?? true;

const { errors, touched, values } = useFormikContext<WorkflowFormValues>();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

import React, { ReactNode, useEffect, useState } from 'react';
import { Field, FieldProps, getIn, useFormikContext } from 'formik';
import { isEmpty } from 'lodash';
import {
EuiCodeEditor,
EuiCompressedFormRow,
EuiLink,
EuiText,
} from '@elastic/eui';
import {
customStringifySingleLine,
WorkflowFormValues,
} from '../../../../../common';
import { camelCaseToTitleString } from '../../../../utils';

/**
 * Props accepted by the JsonLinesField component.
 */
interface JsonLinesFieldProps {
  fieldPath: string; // the full path in string-form to the field (e.g., 'ingest.enrich.processors.text_embedding_processor.inputField')
  // Whether the form row shows validation errors / invalid state.
  // Treated as true when omitted.
  validate?: boolean;
  // Label for the form row. When omitted, a title-cased version of the
  // field name is used instead.
  label?: string;
  // When set, a "Learn more" link pointing at this URL is appended to the label.
  helpLink?: string;
  // Help content passed through to the form row.
  helpText?: string | ReactNode;
  // Height of the code editor. Falls back to '15vh' when omitted.
  editorHeight?: string;
  // Makes the editor read-only and turns off its active-line, selected-word
  // and gutter-line highlighting. Treated as false when omitted.
  readOnly?: boolean;
}

/**
* An input field for a component where users input data in JSON Lines format.
* https://jsonlines.org/
*/
/**
 * Form input for data in JSON Lines format (one JSON value per line),
 * rendered as a code editor inside a formik `Field`.
 *
 * Behavior:
 * - While typing, the raw text is mirrored into local state and the form
 *   value, and any previously displayed parse errors are cleared.
 * - On blur, the field is marked touched and every non-blank line is parsed.
 *   If any line fails to parse, a per-line error summary is shown and the
 *   form value is left as typed. Otherwise the form value is replaced with
 *   a normalized version: one compact JSON value per line, no trailing
 *   newline. Lines whose parsed value is empty per lodash `isEmpty`
 *   (e.g. `{}`) are dropped from the normalized output.
 *
 * @param props see JsonLinesFieldProps
 */
export function JsonLinesField(props: JsonLinesFieldProps) {
  const validate = props.validate ?? true;

  const { errors, touched, values } = useFormikContext<WorkflowFormValues>();

  // temp input state. only format when users click out of the code editor
  const [jsonStr, setJsonStr] = useState<string>('{}');
  const [customErrMsg, setCustomErrMsg] = useState<string | undefined>(
    undefined
  );

  // keep the editor text in sync with the form value. a falsy form value
  // (e.g. an empty string) leaves the current editor text untouched.
  useEffect(() => {
    if (props.fieldPath && values) {
      const formValue = getIn(values, props.fieldPath) as string;
      if (formValue) {
        setJsonStr(formValue);
      }
    }
  }, [props.fieldPath, values]);

  return (
    <Field name={props.fieldPath}>
      {({ field, form }: FieldProps) => {
        return (
          <EuiCompressedFormRow
            fullWidth={true}
            key={props.fieldPath}
            label={props.label || camelCaseToTitleString(field.name)}
            labelAppend={
              props.helpLink ? (
                <EuiText size="xs">
                  <EuiLink href={props.helpLink} target="_blank">
                    Learn more
                  </EuiLink>
                </EuiText>
              ) : undefined
            }
            helpText={props.helpText || undefined}
            error={
              validate ? (
                <>
                  {customErrMsg?.split('\n')?.map((errMsg, idx) => {
                    return (
                      <EuiText key={idx} color="danger" size="s">
                        {errMsg}
                      </EuiText>
                    );
                  })}
                </>
              ) : undefined
            }
            isInvalid={
              validate
                ? getIn(errors, field.name) && getIn(touched, field.name)
                : false
            }
          >
            <EuiCodeEditor
              mode="hjson"
              theme="textmate"
              width="100%"
              height={props.editorHeight || '15vh'}
              value={jsonStr}
              onChange={(input) => {
                setJsonStr(input);
                form.setFieldValue(field.name, input);
                setCustomErrMsg(undefined);
              }}
              onBlur={() => {
                form.setFieldTouched(field.name);
                let finalJsonStr = '';
                const errs = [] as string[];
                // best-effort: a failure while formatting should not break
                // the editor, so anything unexpected is ignored here.
                try {
                  const lines = jsonStr?.split('\n');
                  lines.forEach((line: string, idx) => {
                    if (line.trim() !== '') {
                      let parsedLine = {};
                      try {
                        parsedLine = JSON.parse(line);
                      } catch (error) {
                        // report 1-based line numbers to the user
                        errs.push(
                          getFormattedErrorMsg(error as Error, idx + 1)
                        );
                      }
                      // a failed parse leaves parsedLine as {}, so invalid
                      // lines are skipped here along with empty values.
                      if (!isEmpty(parsedLine)) {
                        // reuse the value parsed above instead of parsing
                        // the same line a second time
                        finalJsonStr +=
                          customStringifySingleLine(parsedLine) + '\n';
                      }
                    }
                  });
                  // remove trailing newline
                  if (finalJsonStr !== '') {
                    finalJsonStr = finalJsonStr.slice(0, -1);
                  }

                  if (errs.length > 0) {
                    // keep the user's raw input in place; only surface errors
                    setCustomErrMsg(getFormattedErrorMsgList(errs));
                  } else {
                    form.setFieldValue(field.name, finalJsonStr);
                    setCustomErrMsg(undefined);
                  }
                } catch (error) {}
              }}
              readOnly={props.readOnly || false}
              setOptions={{
                fontSize: '14px',
                useWorker: false,
                highlightActiveLine: !props.readOnly,
                highlightSelectedWord: !props.readOnly,
                highlightGutterLine: !props.readOnly,
                wrap: true,
              }}
              aria-label="Code Editor"
            />
          </EuiCompressedFormRow>
        );
      }}
    </Field>
  );
}

/**
 * Build a short, user-facing message for a JSON parsing failure.
 *
 * Keeps only the leading part of the parser's message, dropping everything
 * from " in JSON ..." or " after JSON ..." onward (e.g. position details),
 * and prefixes it with the line number.
 *
 * @param error the value caught from the failed parse
 * @param idx the line number to report (used as-is in the message)
 * @returns a string of the form "Error on line <idx>: <reason>"
 */
function getFormattedErrorMsg(error: Error, idx: number): string {
  // Read the message directly and make sure it is a string before calling
  // string methods on it; fall back to a generic reason otherwise (e.g. if
  // something other than an Error was thrown).
  const rawMsg: unknown = error?.message;
  const reason = (typeof rawMsg === 'string' ? rawMsg : 'Invalid JSON')
    .replace(/^(.*?)\s+in JSON.*/, '$1')
    .replace(/^(.*?)\s+after JSON.*/, '$1');
  return `Error on line ${idx}: ${reason}`;
}

/**
 * Collapse a list of error messages into one newline-separated string.
 *
 * The first three messages are included in full; if there are more, a final
 * "<n> more error(s)" line summarizes the rest. The result never ends with
 * a trailing newline, and an empty list yields an empty string.
 *
 * @param errors the individual error messages, in display order
 * @returns the combined message
 */
function getFormattedErrorMsgList(errors: string[]): string {
  const displayed = errors.slice(0, 3);
  const hiddenCount = errors.length - 3;
  if (hiddenCount > 0) {
    const plural = hiddenCount > 1 ? 's' : '';
    displayed.push(`${hiddenCount} more error${plural}`);
  }
  return displayed.join('\n');
}
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@ export function ConfigureExpressionModal(props: ConfigureExpressionModalProps) {
const docs = getIn(values, 'ingest.docs');
let docObjs = [] as {}[] | undefined;
try {
docObjs = JSON.parse(docs);
const lines = docs?.split('\n') as string[];
lines.forEach((line) => docObjs?.push(JSON.parse(line)));
} catch {}
const query = getIn(values, 'search.request');
let queryObj = {} as {} | undefined;
Expand Down Expand Up @@ -465,9 +466,13 @@ export function ConfigureExpressionModal(props: ConfigureExpressionModalProps) {
});
} else {
try {
const docObjs = JSON.parse(
values.ingest.docs
) as {}[];
const docObjs = [] as {}[];
const lines = values?.ingest?.docs?.split(
'\n'
) as string[];
lines.forEach((line) =>
docObjs?.push(JSON.parse(line))
);
if (docObjs.length > 0) {
setSourceInput(
customStringify(docObjs[0])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ export function ConfigureMultiExpressionModal(
const docs = getIn(values, 'ingest.docs');
let docObjs = [] as {}[] | undefined;
try {
docObjs = JSON.parse(docs);
const lines = docs?.split('\n') as string[];
lines.forEach((line) => docObjs?.push(JSON.parse(line)));
} catch {}
const query = getIn(values, 'search.request');
let queryObj = {} as {} | undefined;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ export function ConfigureTemplateModal(props: ConfigureTemplateModalProps) {
const docs = getIn(values, 'ingest.docs');
let docObjs = [] as {}[] | undefined;
try {
docObjs = JSON.parse(docs);
const lines = docs?.split('\n') as string[];
lines.forEach((line) => docObjs?.push(JSON.parse(line)));
} catch {}
const query = getIn(values, 'search.request');
let queryObj = {} as {} | undefined;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,9 @@ export function WorkflowInputs(props: WorkflowInputsProps) {
useEffect(() => {
let parsedDocsObjs = [] as {}[];
try {
parsedDocsObjs = JSON.parse(props.ingestDocs);
} catch (e) {}
const lines = props.ingestDocs?.split('\n') as string[];
lines.forEach((line) => parsedDocsObjs.push(JSON.parse(line)));
} catch {}
setDocsPopulated(parsedDocsObjs.length > 0 && !isEmpty(parsedDocsObjs[0]));
}, [props.ingestDocs]);

Expand Down Expand Up @@ -607,7 +608,8 @@ export function WorkflowInputs(props: WorkflowInputsProps) {
try {
let ingestDocsObjs = [] as {}[];
try {
ingestDocsObjs = JSON.parse(props.ingestDocs);
const lines = props.ingestDocs?.split('\n') as string[];
lines.forEach((line) => ingestDocsObjs.push(JSON.parse(line)));
} catch (e) {}
if (ingestDocsObjs.length > 0 && !isEmpty(ingestDocsObjs[0])) {
success = await validateAndUpdateWorkflow(false, true, false);
Expand Down
Loading

0 comments on commit 0374d0b

Please sign in to comment.