From 5bf6401057e484d065a2391c3b7557a831fa9d47 Mon Sep 17 00:00:00 2001 From: alexsweeten Date: Mon, 22 Apr 2024 12:22:04 -0400 Subject: [PATCH 1/9] bump version --- README.md | 6 +++--- pyproject.toml | 2 +- src/moddotplot/const.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 90b694a..3833217 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Finally, confirm that the installation was installed correctly by running `moddo | | | | (_) | (_| | | |__| | (_) | |_ | | | | (_) | |_ |_| |_|\___/ \__,_| |_____/ \___/ \__| |_| |_|\___/ \__| -v0.8.1 +v0.8.2 usage: moddotplot [-h] {interactive,static} ... @@ -234,7 +234,7 @@ $ moddotplot interactive -f sequences/Chr1_cen.fa | | | | (_) | (_| | | |__| | (_) | |_ | | | | (_) | |_ |_| |_|\___/ \__,_| |_____/ \___/ \__| |_| |_|\___/ \__| -v0.8.1 +v0.8.2 Running ModDotPlot in interactive mode @@ -322,7 +322,7 @@ $ moddotplot static -c config/config.json | | | | (_) | (_| | | |__| | (_) | |_ | | | | (_) | |_ |_| |_|\___/ \__,_| |_____/ \___/ \__| |_| |_|\___/ \__| -v0.8.1 +v0.8.2 Running ModDotPlot in static mode diff --git a/pyproject.toml b/pyproject.toml index 4e74f3c..76d5100 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ModDotPlot" -version = "0.8.1" +version = "0.8.2" requires-python = ">= 3.7" dependencies = [ "pysam", diff --git a/src/moddotplot/const.py b/src/moddotplot/const.py index 8a15eea..d36002c 100644 --- a/src/moddotplot/const.py +++ b/src/moddotplot/const.py @@ -1,4 +1,4 @@ -VERSION = "0.8.1" +VERSION = "0.8.2" COLS = [ "#query_name", "query_start", From 2bdce37ac1f3b070bc66bb5583c3864ab4cf8c66 Mon Sep 17 00:00:00 2001 From: alexsweeten Date: Mon, 22 Apr 2024 12:23:05 -0400 Subject: [PATCH 2/9] remove chr1 centromere index --- sequences/chr1_cen.fa.fai | 1 - 1 file changed, 1 deletion(-) delete mode 100644 sequences/chr1_cen.fa.fai diff --git a/sequences/chr1_cen.fa.fai b/sequences/chr1_cen.fa.fai deleted file mode 100644 index 97023b3..0000000 --- a/sequences/chr1_cen.fa.fai +++ /dev/null @@ -1 +0,0 @@ -chr1:14M-18M 4000000 14 60 61 From 55687a669d2dbfdaeb172551ab109b6aaa2a598c Mon Sep 17 00:00:00 2001 From: alexsweeten Date: Mon, 22 Apr 2024 12:57:06 -0400 Subject: [PATCH 3/9] Fix sorting bug with multifasta files --- src/moddotplot/moddotplot.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/moddotplot/moddotplot.py b/src/moddotplot/moddotplot.py index 8e70fa4..9708904 100644 --- a/src/moddotplot/moddotplot.py +++ b/src/moddotplot/moddotplot.py @@ -502,7 +502,7 @@ def main(): isValidFasta(i) headers = getInputHeaders(i) if len(headers) > 1: - print(f"File {i} contains multiple fasta entries: \n") + print(f"File {i} contains multiple fasta entries. \n") counter = 1 for j in headers: counter += 1 @@ -799,8 +799,8 @@ def main(): # -----------SETUP STATIC MODE----------- elif args.command == "static": # -----------SET SPARSITY VALUE----------- + #TODO: this is not sorting correctly sequences = list(zip(seq_list, k_list)) - sequences.sort(key=lambda x: len(x[1]), reverse=True) # Create output directory, if doesn't exist: if (args.output_dir) and not os.path.exists(args.output_dir): @@ -816,10 +816,12 @@ def main(): res = math.ceil(seq_length / args.window) else: win = math.ceil(seq_length / args.resolution) + if win < args.modimizer: - raise ValueError( + args.modimizer = win + '''raise ValueError( "Window size must be greater than or equal to the modimizer sketch size" - ) + )''' seq_sparsity = round(win / args.modimizer) if seq_sparsity <= args.modimizer: @@ -845,7 +847,7 @@ def main(): # print(f"\tSparsity value s: {seq_sparsity}\n") print(f"\tSequence length n: {len(k_list[i]) + args.kmer - 1}\n") print(f"\tWindow size w: {win}\n") - print(f"\tModimizer sketch value: {expectation}\n") + print(f"\tModimizer sketch size: {expectation}\n") print(f"\tPlot Resolution r: {res}\n") self_mat = createSelfMatrix( seq_length, @@ -905,9 +907,7 @@ def main(): else: win = math.ceil(len(sequences[0][1]) / args.resolution) if win < args.modimizer: - raise ValueError( - "Window size must be greater than or equal to the modimizer sketch size" - ) + args.modimizer = win seq_sparsity = round(win / args.modimizer) if seq_sparsity <= args.modimizer: @@ -935,7 +935,7 @@ def main(): f"\tSequence length {sequences[j][0]}: {smaller_length + args.kmer - 1}\n" ) print(f"\tWindow size w: {win}\n") - print(f"\tModimizer sketch value: {expectation}\n") + print(f"\tModimizer sketch size: {expectation}\n") print(f"\tPlot Resolution r: {res}\n") pair_mat = createPairwiseMatrix( From 479197197d2053eda9dad3480750284c79dc169e Mon Sep 17 00:00:00 2001 From: alexsweeten Date: Mon, 22 Apr 2024 13:12:13 -0400 Subject: [PATCH 4/9] Format with black --- src/moddotplot/__main__.py | 4 ++-- src/moddotplot/estimate_identity.py | 1 - src/moddotplot/interactive.py | 29 +++++++++++++++-------- src/moddotplot/moddotplot.py | 15 ++++++------ src/moddotplot/parse_fasta.py | 36 +++++++++++++++++++---------- src/moddotplot/static_plots.py | 20 +++++++++------- 6 files changed, 64 insertions(+), 41 deletions(-) diff --git a/src/moddotplot/__main__.py b/src/moddotplot/__main__.py index b7d2d39..31b5895 100644 --- a/src/moddotplot/__main__.py +++ b/src/moddotplot/__main__.py @@ -6,6 +6,6 @@ import setproctitle # Set the process title to a custom name -setproctitle.setproctitle('ModDotPlot') +setproctitle.setproctitle("ModDotPlot") -sys.exit(main()) \ No newline at end of file +sys.exit(main()) diff --git a/src/moddotplot/estimate_identity.py b/src/moddotplot/estimate_identity.py index 5a8e82e..0c38c62 100644 --- a/src/moddotplot/estimate_identity.py +++ b/src/moddotplot/estimate_identity.py @@ -510,7 +510,6 @@ def findValueInRange(integer: int, range_dict: dict) -> int: if key[0] >= integer >= key[1]: return value return highest_value - def setZoomLevels(axis_length, sparsity_layers): diff --git a/src/moddotplot/interactive.py b/src/moddotplot/interactive.py index 3de3fc7..660c84d 100644 --- a/src/moddotplot/interactive.py +++ b/src/moddotplot/interactive.py @@ -76,6 +76,7 @@ def run_dash(matrices, metadata, axes, sparsity, identity, port_number, output_d titles = [] for i in range(len(metadata)): titles.append(metadata[i]["title"]) + # Get zooming thresholds, adjust sparsity respectively. def halving_sequence(size, start): sequence = [start] @@ -83,23 +84,31 @@ def halving_sequence(size, start): start /= 2 sequence.append(start) return sequence - #print(current_metadata) - mod_thresholds_list = halving_sequence(len(current_metadata["sparsities"]), current_metadata["x_size"]) - #print(mod_thresholds_list) - - #print(current_metadata["min_window_size"]* current_metadata["resolution"]) - #print(current_metadata["max_window_size"]) - numo = round(math.log2(current_metadata['max_window_size']/current_metadata['min_window_size']) + 1) - #print(numo) + + # print(current_metadata) + mod_thresholds_list = halving_sequence( + len(current_metadata["sparsities"]), current_metadata["x_size"] + ) + # print(mod_thresholds_list) + + # print(current_metadata["min_window_size"]* current_metadata["resolution"]) + # print(current_metadata["max_window_size"]) + numo = round( + math.log2( + current_metadata["max_window_size"] / current_metadata["min_window_size"] + ) + + 1 + ) + # print(numo) important = generateDictionaryFromList(mod_thresholds_list) - #print(f"this is imprtant: {important}") + # print(f"this is imprtant: {important}") main_level = image_pyramid[0] main_x_axis = axes[0][0] main_y_axis = axes[0][1] main_x_axis_np = np.array(main_x_axis) - #TODO: modify value here + # TODO: modify value here main_x_axis_np += 3000 # Modify text so that hover shows interval format diff --git a/src/moddotplot/moddotplot.py b/src/moddotplot/moddotplot.py index 9708904..08a16b5 100644 --- a/src/moddotplot/moddotplot.py +++ b/src/moddotplot/moddotplot.py @@ -541,9 +541,7 @@ def main(): max_window_size = math.ceil(hgi / args.resolution) # If only sequence is too small, throw an error. if max_window_size < 10: - print( - f"Error: sequence too small for analysis.\n" - ) + print(f"Error: sequence too small for analysis.\n") print( f"ModDotPlot requires a minimum window size of 10. Sequences less than 10Kbp will not work with ModDotPlot under normal resolution. We recommend rerunning ModDotPlot with --r {math.ceil(hgi / 10)}.\n" ) @@ -772,10 +770,11 @@ def main(): pickle.dump(metadata, f) # Check if no plot arg is used if args.no_plot: - print(f"Saved matrices to {folder_path}. Thank you for using ModDotPlot!\n") + print( + f"Saved matrices to {folder_path}. Thank you for using ModDotPlot!\n" + ) sys.exit(0) - # Before running dash, change into intervals... axes = [] for matrices_set, meta in zip(matrices, metadata): @@ -799,7 +798,7 @@ def main(): # -----------SETUP STATIC MODE----------- elif args.command == "static": # -----------SET SPARSITY VALUE----------- - #TODO: this is not sorting correctly + # TODO: this is not sorting correctly sequences = list(zip(seq_list, k_list)) # Create output directory, if doesn't exist: @@ -819,9 +818,9 @@ def main(): if win < args.modimizer: args.modimizer = win - '''raise ValueError( + """raise ValueError( "Window size must be greater than or equal to the modimizer sketch size" - )''' + )""" seq_sparsity = round(win / args.modimizer) if seq_sparsity <= args.modimizer: diff --git a/src/moddotplot/parse_fasta.py b/src/moddotplot/parse_fasta.py index 617bfc6..f271862 100644 --- a/src/moddotplot/parse_fasta.py +++ b/src/moddotplot/parse_fasta.py @@ -10,27 +10,37 @@ tab_b = bytes.maketrans(b"ACTG", b"TGAC") + def generateKmersFromFasta(seq: Sequence[str], k: int, quiet: bool) -> Iterable[int]: n = len(seq) if not quiet: progress_thresholds = round(n / 77) - printProgressBar(0, n - k + 1, prefix='Progress:', suffix='Complete', length=40) + printProgressBar(0, n - k + 1, prefix="Progress:", suffix="Complete", length=40) for i in range(n - k + 1): if not quiet: if i % progress_thresholds == 0: - printProgressBar(i, n - k + 1, prefix='Progress:', suffix='Complete', length=40) + printProgressBar( + i, n - k + 1, prefix="Progress:", suffix="Complete", length=40 + ) if i == n - k: - printProgressBar(n - k + 1, n - k + 1, prefix='Progress:', suffix='Completed', length=40) - - kmer = seq[i:i + k] + printProgressBar( + n - k + 1, + n - k + 1, + prefix="Progress:", + suffix="Completed", + length=40, + ) + + kmer = seq[i : i + k] fh = mmh3.hash(kmer) # Calculate reverse complement hash directly without the need for translation rc = mmh3.hash(kmer[::-1].translate(tab_b)) - + yield fh if fh < rc else rc + def isValidFasta(file_path): try: with open(file_path, "r") as file: @@ -52,6 +62,7 @@ def isValidFasta(file_path): print(f"An error occurred: {str(e)}") sys.exit(6) + def extractFiles(folder_path): # Check to see at least one compressed numpy matrix, and one metadata pickle are included metadata = [] @@ -61,15 +72,15 @@ def extractFiles(folder_path): file_path = os.path.join(folder_path, filename) # Full path to the file if filename.endswith(".pkl"): with open(file_path, "rb") as f: - metadata = pickle.load(f) # Append loaded data to the metadata list + metadata = pickle.load(f) # Append loaded data to the metadata list for filename in os.listdir(folder_path): file_path = os.path.join(folder_path, filename) if filename.endswith(".npz"): - pattern = rf'_(\d+)\.npz' # Using f-string to include the value of i in the regex pattern + pattern = rf"_(\d+)\.npz" # Using f-string to include the value of i in the regex pattern tmp2 = re.split(pattern, filename, maxsplit=1) - ff = np.load(file_path,allow_pickle=True) - tmp.append((tmp2[0],tmp2[1], ff)) + ff = np.load(file_path, allow_pickle=True) + tmp.append((tmp2[0], tmp2[1], ff)) sorted_list = sorted(tmp, key=lambda x: (x[0], x[1])) unique_lists = {} @@ -84,7 +95,9 @@ def extractFiles(folder_path): # Convert dictionary values to lists result_lists = list(unique_lists.values()) - sorted_result_lists = [lst for title in metadata for lst in result_lists if lst[0][0] == title['title']] + sorted_result_lists = [ + lst for title in metadata for lst in result_lists if lst[0][0] == title["title"] + ] for unique_list in sorted_result_lists: matrices.append([]) for val in unique_list: @@ -92,7 +105,6 @@ def extractFiles(folder_path): return matrices, metadata - def printProgressBar( iteration, total, diff --git a/src/moddotplot/static_plots.py b/src/moddotplot/static_plots.py index 3c3e669..67e84d0 100644 --- a/src/moddotplot/static_plots.py +++ b/src/moddotplot/static_plots.py @@ -208,7 +208,7 @@ def make_dot(sdf, title_name, palette, palette_orientation, colors): axis_ticks_major=element_line(), title=element_text( family=["DejaVu Sans"], # Change title font family - ) + ), ) + scale_x_continuous(labels=make_scale, limits=[0, max_val]) + scale_y_continuous(labels=make_scale, limits=[0, max_val]) @@ -218,10 +218,11 @@ def make_dot(sdf, title_name, palette, palette_orientation, colors): ) # Adjust x-axis label size - #p += theme(axis_title_x=element_text()) + # p += theme(axis_title_x=element_text()) return p + def make_tri(sdf, title_name, palette, palette_orientation, colors): hexcodes = [] new_hexcodes = [] @@ -274,7 +275,7 @@ def make_tri(sdf, title_name, palette, palette_orientation, colors): axis_ticks_major=element_line(), title=element_text( family=["DejaVu Sans"], # Change title font family - ) + ), ) + scale_x_continuous(labels=make_scale, limits=[0, max_val]) + scale_y_continuous(labels=make_scale, limits=[0, max_val]) @@ -288,6 +289,7 @@ def make_tri(sdf, title_name, palette, palette_orientation, colors): return p + def make_hist(sdf, palette, palette_orientation, custom_colors, custom_breakpoints): hexcodes = [] new_hexcodes = [] @@ -329,9 +331,7 @@ def make_hist(sdf, palette, palette_orientation, custom_colors, custom_breakpoin + scale_color_cmap(cmap_name="plasma") + scale_fill_manual(new_hexcodes) + theme_light() - + theme( - text=element_text(family=["DejaVu Sans"]) - ) + + theme(text=element_text(family=["DejaVu Sans"])) + theme(legend_position="none") + coord_cartesian(xlim=(bot, 100)) + xlab("% Identity Estimate") @@ -378,7 +378,9 @@ def create_plots( if is_pairwise: print(width) - heatmap = make_dot(sdf, plot_filename, palette, palette_orientation, custom_colors) + heatmap = make_dot( + sdf, plot_filename, palette, palette_orientation, custom_colors + ) print(f"Creating plots and saving to {plot_filename}...\n") ggsave( heatmap, @@ -425,7 +427,9 @@ def create_plots( # Self-identity plots: Output _TRI, _FULL, and _HIST else: print(width) - tri_plot = make_tri(sdf, plot_filename, palette, palette_orientation, custom_colors) + tri_plot = make_tri( + sdf, plot_filename, palette, palette_orientation, custom_colors + ) full_plot = make_dot( check_st_en_equality(sdf), plot_filename, From 5918b06df2e58c3f4a80ff962ad88c0bf40381b9 Mon Sep 17 00:00:00 2001 From: Alex Sweeten Date: Mon, 22 Apr 2024 13:15:35 -0400 Subject: [PATCH 5/9] Create black.yml --- .github/workflows/black.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..e302e5a --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,14 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: psf/black@stable + with: + options: "--check --verbose" + src: "./src" + use_pyproject: true From 824299738a2092d0546cd6dd13144326c9c2ed24 Mon Sep 17 00:00:00 2001 From: alexsweeten Date: Mon, 22 Apr 2024 13:17:51 -0400 Subject: [PATCH 6/9] modify black workflow for dev branch --- .github/workflows/black.yml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index e302e5a..b1287d3 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -1,14 +1,19 @@ -name: Lint +name: black -on: [push, pull_request] +# Controls when the action will run. +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + + workflow_dispatch: jobs: - lint: + black: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v2 - uses: psf/black@stable with: - options: "--check --verbose" - src: "./src" - use_pyproject: true + options: ". --check --verbose" \ No newline at end of file From 2cfe3a12dd751462aba5fd3f3b40c8a7ebf1c969 Mon Sep 17 00:00:00 2001 From: alexsweeten Date: Mon, 22 Apr 2024 13:22:32 -0400 Subject: [PATCH 7/9] black lint interactive.py --- src/moddotplot/interactive.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/moddotplot/interactive.py b/src/moddotplot/interactive.py index 660c84d..d88dca7 100644 --- a/src/moddotplot/interactive.py +++ b/src/moddotplot/interactive.py @@ -2,8 +2,6 @@ from moddotplot.estimate_identity import ( getInteractiveColor, getMatchingColors, - verifyModimizers, - setZoomLevels, makeDifferencesEqual, generateDictionaryFromList, findValueInRange, From c153a7d623b712dd68403b86bf41c0d47566d7c3 Mon Sep 17 00:00:00 2001 From: alexsweeten Date: Mon, 22 Apr 2024 13:42:07 -0400 Subject: [PATCH 8/9] Add citation :D --- README.md | 4 +++- src/moddotplot/interactive.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3833217..162ba35 100644 --- a/README.md +++ b/README.md @@ -387,4 +387,6 @@ For bug reports or general usage questions, please raise a GitHub issue, or emai ## Cite -Publication in progress! (almost there :D) +Alexander P. Sweeten, Michael C. Schatz, Adam M. Phillippy, ModDotPlot - Rapid and interactive visualization of complex repeats +bioRxiv 2024.04.15.589623; doi: https://doi.org/10.1101/2024.04.15.589623 + diff --git a/src/moddotplot/interactive.py b/src/moddotplot/interactive.py index d88dca7..5f8dfde 100644 --- a/src/moddotplot/interactive.py +++ b/src/moddotplot/interactive.py @@ -268,9 +268,9 @@ def halving_sequence(size, start): {"label": f"{title}", "value": f"{title}"} for title in titles # Iterate over each title ], - value=titles[0] - if titles - else None, # Set default value based on the length of matrices + value=( + titles[0] if titles else None + ), # Set default value based on the length of matrices clearable=False, # Prevent dropdown from clearing values, style={"width": "300px"}, ), From 992572b132fa85ec22890b3b75832b38074b16ee Mon Sep 17 00:00:00 2001 From: Alex Sweeten Date: Mon, 22 Apr 2024 13:43:45 -0400 Subject: [PATCH 9/9] Create CITATION.cff --- CITATION.cff | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..0724d6a --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ + +@article{Sweeten2024.04.15.589623, + abstract = {Motivation A common method for analyzing genomic repeats is to produce a sequence similarity matrix visualized via a dot plot. Innovative approaches such as StainedGlass have improved upon this classic visualization by rendering dot plots as a heatmap of sequence identity, enabling researchers to better visualize multi-megabase tandem repeat arrays within centromeres and other heterochromatic regions of the genome. However, computing the similarity estimates for heatmaps requires high computational overhead and can suffer from decreasing accuracy. Results In this work we introduce ModDotPlot, an interactive and alignment-free dot plot viewer. By approximating average nucleotide identity via a k-mer-based containment index, ModDotPlot produces accurate plots orders of magnitude faster than StainedGlass. We accomplish this through the use of a hierarchical modimizer scheme that can visualize the full 128 Mbp genome of Arabidopsis thaliana in under 5 minutes on a laptop. ModDotPlot is implemented in Python with a graphical user interface supporting real-time interactive navigation of entire chromosomes. Availability and Implementation ModDotPlot is available at https://github.com/marbl/ModDotPlot.Competing Interest StatementThe authors have declared no competing interest.}, + author = {Alexander P. Sweeten and Michael C. Schatz and Adam M. Phillippy}, + doi = {10.1101/2024.04.15.589623}, + elocation-id = {2024.04.15.589623}, + eprint = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623.full.pdf}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + title = {ModDotPlot - Rapid and interactive visualization of complex repeats}, + url = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623}, + year = {2024}, + bdsk-url-1 = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623}, + bdsk-url-2 = {https://doi.org/10.1101/2024.04.15.589623}}