Merge pull request #396 from AutomatingSciencePipeline/392-system-logs-larger-than-16mb-cannot-be-written-to-the-database

rhit-windsors · web-flow · commit a4d28d0ff387 · 2025-01-09T05:33:42.000Z
Update results, logs, and zips to use file buckets
diff --git a/apps/backend/modules/mongo.py b/apps/backend/modules/mongo.py
@@ -12,33 +12,33 @@ def verify_mongo_connection(mongoClient: pymongo.MongoClient):
         raise Exception("MongoDB server not available/unreachable") from err
     
 def upload_experiment_aggregated_results(experimentId: str, results: str, mongoClient: pymongo.MongoClient):
-    experimentResultEntry = {"experimentId": experimentId, "resultContent": results}
     # Get the results connection
-    resultsCollection = mongoClient["gladosdb"].results
+    resultsBucket = GridFSBucket(mongoClient["gladosdb"], bucket_name='resultsBucket')
     try:
-        resultId = resultsCollection.insert_one(experimentResultEntry).inserted_id
+        # Encode the results string to bytes
+        results_bytes = results.encode('utf-8')
+        # Now we need to store the results in the GridFS bucket
+        resultId = resultsBucket.upload_from_stream(f"results{experimentId}", results_bytes, metadata={"experimentId": experimentId})
         # return the resultID
-        return resultId
+        return str(resultId)
         
     except Exception as err:
         # Change to generic exception
         raise Exception("Encountered error while storing aggregated results in MongoDB") from err
     
 def upload_experiment_zip(experimentId: str, encoded: Binary, mongoClient: pymongo.MongoClient):
-    experimentZipEntry = {"experimentId": experimentId, "fileContent": encoded}
-    zipCollection = mongoClient["gladosdb"].zips
+    zipsBucket = GridFSBucket(mongoClient["gladosdb"], bucket_name='zipsBucket')
     try:
-        resultZipId = zipCollection.insert_one(experimentZipEntry).inserted_id
-        return resultZipId
+        resultId = zipsBucket.upload_from_stream(f"results{experimentId}.zip", encoded, metadata={"experimentId": experimentId})
+        return str(resultId)
     except Exception as err:
         raise Exception("Encountered error while storing results zip in MongoDB") from err
     
 def upload_log_file(experimentId: str, contents: str, mongoClient: pymongo.MongoClient):
-    logFileEntry = {"experimentId": experimentId, "fileContent": contents}
-    logCollection = mongoClient["gladosdb"].logs
+    logsBucket = GridFSBucket(mongoClient["gladosdb"], bucket_name='logsBucket')
     try:
-        resultId = logCollection.insert_one(logFileEntry).inserted_id
-        return resultId
+        resultId = logsBucket.upload_from_stream(f"log{experimentId}.txt", contents.encode('utf-8'), metadata={"experimentId": experimentId})
+        return str(resultId)
     except Exception as err:
         raise Exception("Encountered error while storing log file in MongoDB") from err
     
diff --git a/apps/frontend/lib/mongodb_funcs.ts b/apps/frontend/lib/mongodb_funcs.ts
@@ -45,12 +45,24 @@ export async function deleteDocumentById(expId: string) {
 
     //Since we found it, make sure to delete data from logs, results, and zips
     const db = client.db(DB_NAME);
-    //Delete logs
-    await db.collection('logs').deleteMany({ "experimentId": expId });
-    //Delete results
-    await db.collection('results').deleteMany({ "experimentId": expId });
-    //Delete zips
-    await db.collection('zips').deleteMany({ "experimentId": expId });
+    //Delete logs from bucket
+    const logsBucket = new GridFSBucket(db, { bucketName: 'logsBucket' });
+    const filesToDelete = await logsBucket.find({ "metadata.experimentId": expId }).toArray();
+    for (const file of filesToDelete) {
+        await logsBucket.delete(file._id);
+    }
+    //Delete results from bucket
+    const resultsBucket = new GridFSBucket(db, { bucketName: 'resultsBucket' });
+    const resultsToDelete = await resultsBucket.find({ "metadata.experimentId": expId }).toArray();
+    for (const file of resultsToDelete) {
+        await resultsBucket.delete(file._id);
+    }
+    //Delete zips from bucket
+    const zipsBucket = new GridFSBucket(db, { bucketName: 'zipsBucket' });
+    const zipsToDelete = await zipsBucket.find({ "metadata.experimentId": expId }).toArray();
+    for (const file of zipsToDelete) {
+        await zipsBucket.delete(file._id);
+    }
 
     return Promise.resolve();
 }
diff --git a/apps/frontend/pages/api/download/csv/[expIdToCsvDownload].tsx b/apps/frontend/pages/api/download/csv/[expIdToCsvDownload].tsx
@@ -1,6 +1,7 @@
 import clientPromise, { DB_NAME, COLLECTION_RESULTS_CSVS } from '../../../../lib/mongodb';
 import { NextApiHandler } from 'next';
 import { ResultsCsv } from '../../../../lib/mongodb_types';
+import { GridFSBucket } from 'mongodb';
 
 // TODO possible to extract a common function/class here? very little varies between this and zip
 
@@ -12,26 +13,44 @@ const mongoCSVHandler: NextApiHandler<ResultsCsv> = async (req, res) => {
 	}
 
 	let results;
+
 	try {
 		const client = await clientPromise;
 		const db = client.db(DB_NAME);
+		const resultsBucket = new GridFSBucket(db, { bucketName: 'resultsBucket' });
+		//First check that the file exists
+		results = await resultsBucket.find({ "metadata.experimentId": expIdToCsvDownload }).toArray();
+		if (results.length === 0) {
+			console.warn(`Experiment ${expIdToCsvDownload} CSV not found`);
+			res.status(404).json({ response: 'Experiment CSV not found' } as any);
+			return;
+		}
+		
+		if (results.length !== 1) {
+			console.warn(`Experiment ${expIdToCsvDownload} CSV not found`);
+			res.status(404).json({ response: 'Experiment CSV not found' } as any);
+		} else {
+			//Download the file
+			const downloadStream = resultsBucket.openDownloadStream(results[0]._id);
+			//This has to return the csv contents
+			const chunks: Buffer[] = [];
+			downloadStream.on('data', (chunk) => {
+				chunks.push(chunk);
+			});
+
+			downloadStream.on('end', () => {
+				const csvContents = Buffer.concat(chunks as unknown as Uint8Array[]).toString('utf-8');
+				var result = { resultContent: csvContents };
+				res.json(result as unknown as ResultsCsv);
+			});
+		}
 
-		results = await db
-			.collection(COLLECTION_RESULTS_CSVS)
-			// TODO correct mongodb typescript type for id
-			.find({ 'experimentId': expIdToCsvDownload as any }).toArray();
 	} catch (error) {
 		const message = 'Failed to download the csv';
 		console.error('Error contacting server: ', error);
 		res.status(500).json({ response: message } as any);
 	}
-	if (results.length !== 1) {
-		console.warn(`Experiment ${expIdToCsvDownload} CSV not found`);
-		res.status(404).json({ response: 'Experiment CSV not found' } as any);
-	} else {
-		// TODO correct way to typescript handle this?
-		res.json(results[0] as unknown as ResultsCsv);
-	}
+
 };
 
 export default mongoCSVHandler;
diff --git a/apps/frontend/pages/api/download/logs/[idOfLogFile].tsx b/apps/frontend/pages/api/download/logs/[idOfLogFile].tsx
@@ -1,3 +1,4 @@
+import { GridFSBucket } from 'mongodb';
 import clientPromise, { COLLECTION_LOGS, DB_NAME } from '../../../../lib/mongodb';
 import { NextApiHandler } from 'next';
 
@@ -11,27 +12,49 @@ const mongoLogHandler: NextApiHandler<String> = async (req, res) => {
 	}
 
 	let results;
+	const client = await clientPromise;
+	const db = client.db(DB_NAME);
+	const logsBucket = new GridFSBucket(db, { bucketName: 'logsBucket' });
 	try {
-		const client = await clientPromise;
-		const db = client.db(DB_NAME);
+		//First check that the file exists
+		results = await logsBucket.find({ "metadata.experimentId": idOfLogFile }).toArray();
+		if (results.length === 0) {
+			console.warn(`Experiment ${idOfLogFile} Log not found`);
+			res.status(404).json({ response: `Experiment Log '${idOfLogFile}' not found. Please contact the GLADOS team for further troubleshooting.` } as any);
+			return;
+		}
+
+		if (results.length !== 1) {
+			console.warn(`Experiment ${idOfLogFile} Log not found`);
+			res.status(404).json({ response: `Experiment Log '${idOfLogFile}' not found. Please contact the GLADOS team for further troubleshooting.` } as any);
+		} else {
+			//Download the file
+			const downloadStream = logsBucket.openDownloadStream(results[0]._id);
+			//This has to return the log contents
+			const chunks: Buffer[] = [];
+			downloadStream.on('data', (chunk) => {
+				chunks.push(chunk);
+			});
+
+			downloadStream.on('end', () => {
+				const contents = Buffer.concat(chunks as unknown as Uint8Array[]).toString('utf-8');
+				if (contents.length === 0) {
+					console.warn(`Experiment ${idOfLogFile} Log was empty`);
+					res.send(`Experiment Log '${idOfLogFile}' was empty.`);
+				}
+				else {
+					res.send(contents);
+				}
+
+			});
+		}
 
-		results = await db
-			.collection(COLLECTION_LOGS)
-			// TODO correct mongodb typescript type for id
-			.find({ 'experimentId': idOfLogFile as any }).toArray();
 	} catch (error) {
 		const message = 'Failed to download the log file';
 		console.error('Error contacting server: ', error);
 		res.status(500).json({ response: message } as any);
 	}
-	if (results.length !== 1) {
-		console.warn(`Experiment ${idOfLogFile} Log not found`);
-		res.status(404).json({ response: `Experiment Log '${idOfLogFile}' not found. Remember, the production database doesn't have the logs of experiments from dev!` } as any);
-	} else {
-		const result = results[0];
-		const contents = `${result?.fileContent ?? 'The log file was empty, or an error occurred'}`;
-		res.send(contents);
-	}
+
 };
 
 export default mongoLogHandler;
diff --git a/apps/frontend/pages/api/download/zip/[expIdToZipDownload].tsx b/apps/frontend/pages/api/download/zip/[expIdToZipDownload].tsx
@@ -1,6 +1,7 @@
 import clientPromise, { COLLECTION_ZIPS, DB_NAME } from '../../../../lib/mongodb';
 import { NextApiHandler } from 'next';
 import { ProjectZip } from '../../../../lib/mongodb_types';
+import { GridFSBucket } from 'mongodb';
 
 // TODO possible to extract a common function/class here? very little varies between this and csv
 
@@ -15,23 +16,41 @@ const mongoZipHandler: NextApiHandler<ProjectZip> = async (req, res) => {
 	try {
 		const client = await clientPromise;
 		const db = client.db(DB_NAME);
+		const zipsBucket = new GridFSBucket(db, { bucketName: 'zipsBucket' });
+
+		// First check that the file exists
+		results = await zipsBucket.find({ 'metadata.experimentId': expIdToZipDownload }).toArray();
+
+		if (results.length === 0) {
+			console.warn(`Experiment ${expIdToZipDownload} ZIP not found`);
+			res.status(404).json({ response: 'Experiment ZIP not found' } as any);
+			return;
+		}
+
+		// Download the file
+		const downloadStream = zipsBucket.openDownloadStream(results[0]._id);
+		// This has to return the zip contents
+		const chunks: Buffer[] = [];
+		downloadStream.on('data', (chunk) => {
+			chunks.push(chunk);
+		});
+
+		downloadStream.on('end', () => {
+			//Zip contents are b64 encoded and stored in binary format
+			//So we need to concatenate the chunks and return them as a buffer
+			const zipContents = Buffer.concat(chunks as unknown as Uint8Array[]);
+			//Make sure this is returned as a string
+			const zipContentsString = zipContents.toString('base64');
+			var result = { fileContent: zipContentsString };
+			res.json(result as unknown as ProjectZip);
+		});
 
-		results = await db
-			.collection(COLLECTION_ZIPS)
-			// TODO correct mongodb typescript type for id
-			.find({ 'experimentId': expIdToZipDownload as any }).toArray();
 	} catch (error) {
 		const message = 'Failed to download the zip';
 		console.error('Error contacting server: ', error);
 		res.status(500).json({ response: message } as any);
 	}
-	if (results.length !== 1) {
-		console.warn(`Experiment ${expIdToZipDownload} ZIP not found`);
-		res.status(404).json({ response: 'Experiment ZIP not found' } as any);
-	} else {
-		// TODO correct way to typescript handle this?
-		res.json(results[0] as unknown as ProjectZip);
-	}
+
 };
 
 // Remove the file size limit on the response
diff --git a/apps/runner/modules/utils.py b/apps/runner/modules/utils.py
@@ -160,7 +160,7 @@ def _call_backend(url, payload, log_msg):
     try:
         response = requests.post(url, json=payload, timeout=10)
         if response.status_code == 200:
-            result_id = response.json().get('_id')
+            result_id = response.json().get('id')
             if result_id:
                 explogger.info(f"{log_msg}: {result_id}")
             else: