From 6109131bb4010216fae8b023efb5be0cfc440f1d Mon Sep 17 00:00:00 2001 From: andrekeys Date: Fri, 16 Feb 2024 09:04:47 -0700 Subject: [PATCH 1/5] adding interface --- .../create/GeneBankFileReaderInterface.java | 22 +++++++++++++ src/test/java/cs321/btree/BTreeTest.java | 32 ++----------------- 2 files changed, 24 insertions(+), 30 deletions(-) create mode 100644 src/main/java/cs321/create/GeneBankFileReaderInterface.java diff --git a/src/main/java/cs321/create/GeneBankFileReaderInterface.java b/src/main/java/cs321/create/GeneBankFileReaderInterface.java new file mode 100644 index 0000000..ec5924f --- /dev/null +++ b/src/main/java/cs321/create/GeneBankFileReaderInterface.java @@ -0,0 +1,22 @@ +package cs321.create; + +import java.io.IOException; + +/** + * Reads the sequences of length k starting at each position in a GBK GeneBank file from NCBI. + * Implementations return each sequence of the specified length as a long. + * + * @author CS321 Instructors + */ +public interface GeneBankFileReaderInterface { + + /** + * Gets the next sequence of a given length as a long. + * See SequenceUtils for translation utilities. + * + * @return the next sequence, formatted as a long + * @throws IOException in case of failed or interrupted I/O + */ + long getNextSequence() throws IOException; + +} diff --git a/src/test/java/cs321/btree/BTreeTest.java b/src/test/java/cs321/btree/BTreeTest.java index 57b650e..957a879 100644 --- a/src/test/java/cs321/btree/BTreeTest.java +++ b/src/test/java/cs321/btree/BTreeTest.java @@ -21,6 +21,8 @@ * * This is to provide more complicated tests that can be modeled * after figures in the CLRS textbook. 
+ * + * @author CS321 Instructors */ public class BTreeTest { @@ -48,36 +50,6 @@ public void cleanUpTests() { deleteTestFile(testFilename); } - // HINT: - // instead of checking all intermediate states of constructing a tree - // you can check the final state of the tree and - // assert that the constructed tree has the expected number of nodes and - // assert that some (or all) of the nodes have the expected values - @Test - public void btreeDegree4Test() - { -// //TODO instantiate and populate a bTree object -// int expectedNumberOfNodes = TBD; -// -// // it is expected that these nodes values will appear in the tree when -// // using a level traversal (i.e., root, then level 1 from left to right, then -// // level 2 from left to right, etc.) -// String[] expectedNodesContent = new String[]{ -// "TBD, TBD", //root content -// "TBD", //first child of root content -// "TBD, TBD, TBD", //second child of root content -// }; -// -// assertEquals(expectedNumberOfNodes, bTree.getNumberOfNodes()); -// for (int indexNode = 0; indexNode < expectedNumberOfNodes; indexNode++) -// { -// // root has indexNode=0, -// // first child of root has indexNode=1, -// // second child of root has indexNode=2, and so on. -// assertEquals(expectedNodesContent[indexNode], bTree.getArrayOfNodeContentsForNodeIndex(indexNode).toString()); -// } - } - /** * Test simple creation of an empty BTree. * An empty BTree has 1 node with no keys and height of 0. From 4a84b7e300e4dfe977924cb6fa2ebb135a108b8c Mon Sep 17 00:00:00 2001 From: andrekeys Date: Fri, 16 Feb 2024 09:36:42 -0700 Subject: [PATCH 2/5] updating database documentation --- README.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1e154eb..5044db4 100644 --- a/README.md +++ b/README.md @@ -317,7 +317,7 @@ length `k`*. The search returns the frequency of occurrence of the query string zero if it is not found). 
We will also create a SQL database (for a specific length `k`) of subsequences and their -frequency to aid in searching. This can be created from the BTree. +frequency to aid in searching. This can be created from the BTree dump files. ## 4. Design Issues @@ -366,7 +366,7 @@ We will create three programs: - second for **searching in the specified BTree** for subsequences of given length. The search program assumes that the user specified the proper BTree to use depending upon the query length. - third for **searching in the SQL database** for subsequences of specified length. This database - would be created as a by-product of the first program. + would be created by loading btree dump files using the SQLite interface outside our program. The main Java classes should be named `GeneBankCreateBTree`, `GeneBankSearchBTree`, and `GeneBankSearchDatabase`. @@ -413,8 +413,8 @@ have the same length as the DNA subsequences in the B-Tree file. The DNA strings - `[]` is an integer between `100` and `10000` (inclusive) that represents the maximum number of `BTreeNode` objects that can be stored in memory -- `` the path to the SQL database created after BTree creation for a - specific sequence length. The name of the database file should be `xyz.k.db` where the sequence +- `` the path to the SQL database created by loading a dump file using + SQLite's .import command. The name of the database file should be `xyz.k.db` where the sequence length is ``, and the GeneBank file is `xyz.gbk`. The database file should have been created by the `GeneBankCreateBTree` program from the BTree it creates @@ -528,9 +528,7 @@ needs to be specified as well. file. For example, the table below shows the improvement the instructors got on their solution. Note that -your times will be different due to different hardware and differences in the implementation. Also, -we turned off the creation of the database for these timings -- creation of the database will take a -significant amount of time. 
+your times will be different due to different hardware and differences in the implementation. | gbk file | degree | sequence length | cache | cache size | cache hit rate | run time | | -------- | ------ | --------------- | ----- | ---------- | -------------- | -------- | @@ -551,20 +549,24 @@ of memory), we were able to bring the time to create the BTree down to only 2m19 ## 7. Using a Database -Design a simple database to store the results (sequences and frequencies) from the B-Tree. -We will perform an inorder tree traversal to get the information to store in the database. This -would be done at the end of creating the GeneBank BTree. Then we will create a separate search -program named `GeneBankSearchDatabase` that uses the database instead of the BTree. This is -a common pattern in real life applications, where we may crunch lots of data using a data +Using the dumpfiles from your BTree, load the data into an SQLite database using the +SQLite .import command. See the documentation +(here)[https://sqlite.org/cli.html#importing_files_as_csv_or_other_formats]. +Then we will create a separate search program named `GeneBankSearchDatabase` +that uses the database instead of the BTree. This is a common pattern in real life +applications, where we may crunch lots of data using a data structure and then store the results in a database for ease of access. +Note: Since correct dumpfiles are provided in under results/ GeneBankSearchDatabase +can be started and completed before GeneBankCreateBTree. + ```bash $ ./gradlew createJarGeneBankSearchDatabase $ java -jar build/libs/GeneBankSearchDatabase.jar --database= --queryfile= ``` -We will use the embedded SQLite database for this project. The SQLite database is fully +We will use the embedded SQLite database for searching the database. The SQLite driver is fully contained in a jar file that gradle will automatically pull down for us. See the database example in the section below on how to use SQLite. 
From 4e37f655ba3f5e909e67989ab9a35a8abc8509b0 Mon Sep 17 00:00:00 2001 From: andrekeys Date: Fri, 16 Feb 2024 09:48:55 -0700 Subject: [PATCH 3/5] updating running on the cloud instructions --- README.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5044db4..589b3a8 100644 --- a/README.md +++ b/README.md @@ -649,11 +649,18 @@ your project. Start off by running tests on your machine. If you do need to run them on `onyx` please only run the smallest test (`test0.gbk`) to avoid overloading the `onyx` server. -## 10. Testing in the Cloud +## 10. Extra Credit: Testing a Large File in the Cloud -We will setup [Amazon AWS](https://aws.amazon.com/) accounts for each student so that you can run -larger tests in the cloud. **Running our tests on AWS is required so we can all get experience -using the cloud.** :cloud: :smiley: +Using the AWS Accounts provided earlier in the course, you can run the +large Y-Chromosome file on a cloud instance. + +Creating a BTree with the large file is very intensive, and will take too long to run +unless your cache implementation is efficient, and your BTree is well-designed. + +To be rewarded the extra credit, capture a screenshot of check-queries.sh completed with a time stamp +and include it with your submission. 
+ +:cloud: :smiley: Please see the [AWS notes](https://docs.google.com/document/d/1v5a0XlzaNyi63TXXKP4BQsPIdJt4Zkxn2lZofVP8qqw/edit?usp=sharing) From e47767fd3d70bf45800e43baf939f13756a3070c Mon Sep 17 00:00:00 2001 From: andrekeys Date: Fri, 16 Feb 2024 09:53:01 -0700 Subject: [PATCH 4/5] updating database documentation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 589b3a8..482e068 100644 --- a/README.md +++ b/README.md @@ -551,7 +551,7 @@ of memory), we were able to bring the time to create the BTree down to only 2m19 Using the dumpfiles from your BTree, load the data into an SQLite database using the SQLite .import command. See the documentation -(here)[https://sqlite.org/cli.html#importing_files_as_csv_or_other_formats]. +[here](https://sqlite.org/cli.html#importing_files_as_csv_or_other_formats). Then we will create a separate search program named `GeneBankSearchDatabase` that uses the database instead of the BTree. This is a common pattern in real life applications, where we may crunch lots of data using a data From c50ac3b70525f247cf3694dd78ea26c232069cd2 Mon Sep 17 00:00:00 2001 From: andrekeys Date: Mon, 26 Feb 2024 12:25:58 -0700 Subject: [PATCH 5/5] fixing some typos --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 482e068..0d2f6a7 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ automatically by gradle or our IDE. The focus of this project is to learn about data structures, while working effectively in a group. 
In addition, given the small project scope, and the fixed set of requirements that are already defined (and will not need to be elicited with the use of a Product Owner), the team can -customize the Scrum process learned in CS-HU 271 and focus exclusively on: +customize the Scrum process learned in CS-HU 208 and focus exclusively on: - creating tasks - linking commits to task IDs (e.g., `Implements task #123`) - Test-Driven Development and unit testing. The [starter code](#starter-code) already contains a few [sample unit tests](src/test/java/cs321) that can be [run from the command line](#compile-and-run-the-project-from-the-command-line). @@ -557,7 +557,7 @@ that uses the database instead of the BTree. This is a common pattern in real li applications, where we may crunch lots of data using a data structure and then store the results in a database for ease of access. -Note: Since correct dumpfiles are provided in under results/ GeneBankSearchDatabase +Note: Since correct dumpfiles are provided in the results folder, GeneBankSearchDatabase can be started and completed before GeneBankCreateBTree. ```bash @@ -654,10 +654,10 @@ run the smallest test (`test0.gbk`) to avoid overloading the `onyx` server. Using the AWS Accounts provided earlier in the course, you can run the large Y-Chromosome file on a cloud instance. -Creating a BTree with the large file is very intensive, and will take too long to run -unless your cache implementation is efficient, and your BTree is well-designed. +Creating a BTree with the large file is very time-intensive. It will take too long to run +unless your cache implementation is efficient and your BTree is well-designed. -To be rewarded the extra credit, capture a screenshot of check-queries.sh completed with a time stamp +To earn the extra credit, capture a screenshot showing `check-queries.sh` completed and include it with your submission. :cloud: :smiley: