From 24267e93db942a679b903a2dce76288ccf198b52 Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Wed, 28 Aug 2024 16:24:28 +0200 Subject: [PATCH] [CRON] Update publications --- _data/publications.yml | 667 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 638 insertions(+), 29 deletions(-) diff --git a/_data/publications.yml b/_data/publications.yml index 2f2fc36..aa14b4a 100644 --- a/_data/publications.yml +++ b/_data/publications.yml @@ -13686,32 +13686,32 @@ type: thesis uri: http://publications.imp.fu-berlin.de/id/eprint/2855 userid: 132 -- abstract: "Motivation: \r\nDeep learning has moved to the forefront of tandem mass\ +- abstract: "Motivation \r\nDeep learning has moved to the forefront of tandem mass\ \ spectrometry-driven proteomics and authentic prediction for peptide fragmentation\ \ is more feasible than ever. Still, at this point spectral prediction is mainly\ - \ used to validate database search results or used for confined search spaces.\ - \ Fully predicted spectral libraries have not yet been efficiently adapted to\ - \ large search space problems that often occur in metaproteomics or proteogenomics.\r\ - \n\r\nResults: \r\nIn this study, we showcase a workflow that uses Prosit for\ - \ spectral library predictions on two common metaproteomes and implement an indexing\ - \ and search algorithm, Mistle, to efficiently identify experimental mass spectra\ + \ used to validate database search results or for confined search spaces. Fully\ + \ predicted spectral libraries have not yet been efficiently adapted to large\ + \ search space problems that often occur in metaproteomics or proteogenomics.\r\ + \n\r\nResults\r\nIn this study, we showcase a workflow that uses Prosit for spectral\ + \ library predictions on two common metaproteomes and implement an indexing and\ + \ search algorithm, Mistle, to efficiently identify experimental mass spectra\ \ within the library. Hence, the workflow emulates a classic protein sequence\ \ database search with protein digestion but builds a searchable index from spectral\ \ predictions as an in-between step. We compare Mistle to popular search engines,\ \ both on a spectral and database search level, and provide evidence that this\ \ approach is more accurate than a database search using MSFragger. Mistle outperforms\ \ other spectral library search engines in terms of run time and proves to be\ - \ extremely memory efficient with an 8 to 22-fold decrease in RAM usage. This\ + \ extremely memory efficient with a 4- to 22-fold decrease in RAM usage. This\ \ makes Mistle universally applicable to large search spaces, e.g. covering comprehensive\ - \ sequence databases of diverse microbiomes.\r\n\r\nAvailability: \r\nMistle is\ - \ freely available on GitHub at https://github.com/BAMeScience/Mistle." - bibtex: "@article{fu_mi_publications2946,\n abstract = {Motivation: \nDeep learning\ + \ sequence databases of diverse microbiomes.\r\n\r\nAvailability and implementation\r\ + \nMistle is freely available on GitHub at https://github.com/BAMeScience/Mistle." + bibtex: "@article{fu_mi_publications2946,\n abstract = {Motivation \nDeep learning\ \ has moved to the forefront of tandem mass spectrometry-driven proteomics and\ \ authentic prediction for peptide fragmentation is more feasible than ever. Still,\ \ at this point spectral prediction is mainly used to validate database search\ - \ results or used for confined search spaces. Fully predicted spectral libraries\ - \ have not yet been efficiently adapted to large search space problems that often\ - \ occur in metaproteomics or proteogenomics.\n\nResults: \nIn this study, we showcase\ + \ results or for confined search spaces. Fully predicted spectral libraries have\ + \ not yet been efficiently adapted to large search space problems that often occur\ + \ in metaproteomics or proteogenomics.\n\nResults\nIn this study, we showcase\ \ a workflow that uses Prosit for spectral library predictions on two common metaproteomes\ \ and implement an indexing and search algorithm, Mistle, to efficiently identify\ \ experimental mass spectra within the library. Hence, the workflow emulates a\ @@ -13720,16 +13720,17 @@ \ Mistle to popular search engines, both on a spectral and database search level,\ \ and provide evidence that this approach is more accurate than a database search\ \ using MSFragger. Mistle outperforms other spectral library search engines in\ - \ terms of run time and proves to be extremely memory efficient with an 8 to 22-fold\ + \ terms of run time and proves to be extremely memory efficient with a 4- to 22-fold\ \ decrease in RAM usage. This makes Mistle universally applicable to large search\ \ spaces, e.g. covering comprehensive sequence databases of diverse microbiomes.\n\ - \nAvailability: \nMistle is freely available on GitHub at https://github.com/BAMeScience/Mistle.},\n\ + \nAvailability and implementation\nMistle is freely available on GitHub at https://github.com/BAMeScience/Mistle.},\n\ \ author = {Yannek Nowatzky and Philipp Benner and Knut Reinert and Thilo Muth},\n\ \ booktitle = {Mistle: bringing spectral library predictions to metaproteomics\ - \ with an efficient search index},\n journal = {bioRxiv},\n month = {September},\n\ - \ title = {Mistle: bringing spectral library predictions to metaproteomics with\ - \ an efficient search index},\n url = {http://publications.imp.fu-berlin.de/2946/},\n\ - \ year = {2022}\n}\n" + \ with an efficient search index},\n journal = {Bioinformatics},\n month = {September},\n\ + \ number = {6},\n publisher = {Oxford University Press},\n title = {Mistle: bringing\ + \ spectral library predictions to metaproteomics with an efficient search index},\n\ + \ url = {http://publications.imp.fu-berlin.de/2946/},\n volume = {39},\n year\ + \ = {2023}\n}\n" book_title: 'Mistle: bringing spectral library predictions to metaproteomics with an efficient search index' creators: @@ -13753,8 +13754,8 @@ given: Thilo honourific: null lineage: null - date: '2022-09-12' - date_type: submitted + date: '2023-09-09' + date_type: published datestamp: '2023-04-19 12:51:18' dir: disk0/00/00/29/46 divisions: @@ -13762,16 +13763,19 @@ eprint_status: archive eprintid: 2946 full_text_status: none - id_number: doi:10.1101/2022.09.09.507252 + id_number: doi:10.1093/bioinformatics/btad376 ispublished: pub + issn: 1367-4811 key: fu_mi_publications2946 - lastmod: '2023-04-19 12:51:18' + lastmod: '2024-04-18 10:23:45' metadata_visibility: show - official_url: https://doi.org/10.1101/2022.09.09.507252 - publication: bioRxiv - refereed: 'FALSE' - rev_number: 5 - status_changed: '2023-04-19 12:51:18' + number: 6 + official_url: https://doi.org/10.1093/bioinformatics/btad376 + publication: Bioinformatics + publisher: Oxford University Press + refereed: 'TRUE' + rev_number: 8 + status_changed: '2024-04-18 10:23:45' subjects: - G400 title: 'Mistle: bringing spectral library predictions to metaproteomics with an @@ -13779,6 +13783,7 @@ type: article uri: http://publications.imp.fu-berlin.de/id/eprint/2946 userid: 132 + volume: 39 - abstract: 'Alignment is the cornerstone of many long-read pipelines and plays an essential role in resolving structural variants (SVs). However, forced alignments of SVs embedded in long reads, inflexibility of integrating novel SVs models and @@ -14087,6 +14092,610 @@ uri: http://publications.imp.fu-berlin.de/id/eprint/2949 userid: 132 volume: 12 +- abstract: "Motivation\r\nThe minimizer concept is a data structure for sequence\ + \ sketching. The standard canonical minimizer selects a subset of k-mers from\ + \ the given DNA sequence by comparing the forward and reverse k-mers in a window\ + \ simultaneously according to a predefined selection scheme. It is widely employed\ + \ by sequence analysis such as read mapping and assembly. k-mer density, k-mer\ + \ repetitiveness (e.g. k-mer bias), and computational efficiency are three critical\ + \ measurements for minimizer selection schemes. However, there exist trade-offs\ + \ between kinds of minimizer variants. Generic, effective, and efficient are always\ + \ the requirements for high-performance minimizer algorithms.\r\n\r\nResults\r\ + \nWe propose a simple minimizer operator as a refinement of the standard canonical\ + \ minimizer. It takes only a few operations to compute. However, it can improve\ + \ the k-mer repetitiveness, especially for the lexicographic order. It applies\ + \ to other selection schemes of total orders (e.g. random orders). Moreover, it\ + \ is computationally efficient and the density is close to that of the standard\ + \ minimizer. The refined minimizer may benefit high-performance applications like\ + \ binning and read mapping.\r\n\r\nAvailability and implementation\r\nThe source\ + \ code of the benchmark in this work is available at the github repository https://github.com/xp3i4/mini_benchmark" + bibtex: "@article{fu_mi_publications3139,\n abstract = {Motivation\nThe minimizer\ + \ concept is a data structure for sequence sketching. The standard canonical minimizer\ + \ selects a subset of k-mers from the given DNA sequence by comparing the forward\ + \ and reverse k-mers in a window simultaneously according to a predefined selection\ + \ scheme. It is widely employed by sequence analysis such as read mapping and\ + \ assembly. k-mer density, k-mer repetitiveness (e.g. k-mer bias), and computational\ + \ efficiency are three critical measurements for minimizer selection schemes.\ + \ However, there exist trade-offs between kinds of minimizer variants. Generic,\ + \ effective, and efficient are always the requirements for high-performance minimizer\ + \ algorithms.\n\nResults\nWe propose a simple minimizer operator as a refinement\ + \ of the standard canonical minimizer. It takes only a few operations to compute.\ + \ However, it can improve the k-mer repetitiveness, especially for the lexicographic\ + \ order. It applies to other selection schemes of total orders (e.g. random orders).\ + \ Moreover, it is computationally efficient and the density is close to that of\ + \ the standard minimizer. The refined minimizer may benefit high-performance applications\ + \ like binning and read mapping.\n\nAvailability and implementation\nThe source\ + \ code of the benchmark in this work is available at the github repository https://github.com/xp3i4/mini\\\ + _benchmark},\n author = {Chenxu Pan and Knut Reinert and Alfonso Valencia},\n\ + \ journal = {Bioinformatics},\n month = {January},\n number = {2},\n publisher\ + \ = {Oxford University Press},\n title = {A simple refined DNA minimizer operator\ + \ enables 2-fold faster computation},\n url = {http://publications.imp.fu-berlin.de/3139/},\n\ + \ volume = {40},\n year = {2024}\n}\n" + creators: + - name: + family: Pan + given: Chenxu + honourific: null + lineage: null + - name: + family: Reinert + given: Knut + honourific: null + lineage: null + - name: + family: Valencia + given: Alfonso + honourific: null + lineage: null + date: '2024-01-25' + date_type: published + datestamp: '2024-04-18 10:31:29' + dir: disk0/00/00/31/39 + divisions: + - group_algbioinf + eprint_status: archive + eprintid: 3139 + full_text_status: none + id_number: doi:10.1093/bioinformatics/btae045 + ispublished: pub + issn: 1367-4811 + key: fu_mi_publications3139 + lastmod: '2024-04-18 11:45:45' + metadata_visibility: show + number: 2 + official_url: https://doi.org/10.1093/bioinformatics/btae045 + publication: Bioinformatics + publisher: Oxford University Press + refereed: 'TRUE' + rev_number: 8 + status_changed: '2024-04-18 11:45:45' + subjects: + - G400 + title: A simple refined DNA minimizer operator enables 2-fold faster computation + type: article + uri: http://publications.imp.fu-berlin.de/id/eprint/3139 + userid: 132 + volume: 40 +- abstract: Scientific communities are motivated to schedule their large-scale data + analysis workflows in heterogeneous cluster environments because of privacy and + financial issues. In such environments containing considerably diverse resources, + efficient resource allocation approaches are essential for reaching high performance. + Accordingly, this research addresses the scheduling problem of workflows with + bag-of-task form to minimize total runtime (makespan). To this aim, we develop + a mixed-integer linear programming model (MILP). The proposed model contains binary + decision variables determining which tasks should be assigned to which nodes. + Also, it contains linear constraints to fulfill the tasks requirements such as + memory and scheduling policy. Comparative results show that our approach outperforms + related approaches in most cases. As part of the post-optimality analysis, some + secondary preferences are imposed on the proposed model to obtain the most preferred + optimal solution. We analyze the relaxation of the makespan in the hope of significantly + reducing the number of consumed nodes. + bibtex: "@article{fu_mi_publications3140,\n abstract = {Scientific communities are\ + \ motivated to schedule their large-scale data analysis workflows in heterogeneous\ + \ cluster environments because of privacy and financial issues. In such environments\ + \ containing considerably diverse resources, efficient resource allocation approaches\ + \ are essential for reaching high performance. Accordingly, this research addresses\ + \ the scheduling problem of workflows with bag-of-task form to minimize total\ + \ runtime (makespan). To this aim, we develop a mixed-integer linear programming\ + \ model (MILP). The proposed model contains binary decision variables determining\ + \ which tasks should be assigned to which nodes. Also, it contains linear constraints\ + \ to fulfill the tasks requirements such as memory and scheduling policy. Comparative\ + \ results show that our approach outperforms related approaches in most cases.\ + \ As part of the post-optimality analysis, some secondary preferences are imposed\ + \ on the proposed model to obtain the most preferred optimal solution. We analyze\ + \ the relaxation of the makespan in the hope of significantly reducing the number\ + \ of consumed nodes.},\n author = {Somayeh Mohammadi and Latif PourKarimi and\ + \ Felix Droop and Ninon De Mecquenem and Ulf Leser and Knut Reinert},\n journal\ + \ = {The Journal of Supercomputing},\n month = {March},\n number = {17},\n pages\ + \ = {19019--19048},\n publisher = {Springer},\n title = {A mathematical programming\ + \ approach for resource allocation of data analysis workflows on heterogeneous\ + \ clusters},\n url = {http://publications.imp.fu-berlin.de/3140/},\n volume =\ + \ {79},\n year = {2023}\n}\n" + creators: + - name: + family: Mohammadi + given: Somayeh + honourific: null + lineage: null + - name: + family: PourKarimi + given: Latif + honourific: null + lineage: null + - name: + family: Droop + given: Felix + honourific: null + lineage: null + - name: + family: De Mecquenem + given: Ninon + honourific: null + lineage: null + - name: + family: Leser + given: Ulf + honourific: null + lineage: null + - name: + family: Reinert + given: Knut + honourific: null + lineage: null + date: '2023-03-23' + date_type: published + datestamp: '2024-04-18 10:36:11' + dir: disk0/00/00/31/40 + divisions: + - group_algbioinf + eprint_status: archive + eprintid: 3140 + full_text_status: none + id_number: doi:10.1007/s11227-023-05325-w + ispublished: pub + issn: 0920-8542 + key: fu_mi_publications3140 + lastmod: '2024-04-18 10:36:11' + metadata_visibility: show + number: 17 + official_url: https://doi.org/10.1007/s11227-023-05325-w + pagerange: 19019-19048 + publication: The Journal of Supercomputing + publisher: Springer + refereed: 'TRUE' + rev_number: 5 + status_changed: '2024-04-18 10:36:11' + subjects: + - G400 + title: A mathematical programming approach for resource allocation of data analysis + workflows on heterogeneous clusters + type: article + uri: http://publications.imp.fu-berlin.de/id/eprint/3140 + userid: 132 + volume: 79 +- abstract: Circular extrachromosomal DNA (ecDNA) is a form of oncogene amplification + found across cancer types and associated with poor outcome in patients. ecDNA + can be structurally complex and contain rearranged DNA sequences derived from + multiple chromosome locations. As the structure of ecDNA can impact oncogene regulation + and may indicate mechanisms of its formation, disentangling it at high resolution + from sequencing data is essential. Even though methods have been developed to + identify and reconstruct ecDNA in cancer genome sequencing, it remains challenging + to resolve complex ecDNA structures, in particular amplicons with shared genomic + footprints. We here introduce Decoil, a computational method which combines a + breakpoint-graph approach with LASSO regression to reconstruct complex ecDNA and + deconvolve co-occurring ecDNA elements with overlapping genomic footprints from + long-read nanopore sequencing. Decoil outperforms de-novo assembly methods in + simulated long-read sequencing data for both, simple and complex ecDNAs. Applying + Decoil on whole genome sequencing data uncovered different ecDNA topologies and + explored ecDNA structure heterogeneity in neuroblastoma tumors and cell lines, + indicating that this method may improve ecDNA structural analyzes in cancer. + bibtex: "@article{fu_mi_publications3141,\n abstract = {Circular extrachromosomal\ + \ DNA (ecDNA) is a form of oncogene amplification found across cancer types and\ + \ associated with poor outcome in patients. ecDNA can be structurally complex\ + \ and contain rearranged DNA sequences derived from multiple chromosome locations.\ + \ As the structure of ecDNA can impact oncogene regulation and may indicate mechanisms\ + \ of its formation, disentangling it at high resolution from sequencing data is\ + \ essential. Even though methods have been developed to identify and reconstruct\ + \ ecDNA in cancer genome sequencing, it remains challenging to resolve complex\ + \ ecDNA structures, in particular amplicons with shared genomic footprints. We\ + \ here introduce Decoil, a computational method which combines a breakpoint-graph\ + \ approach with LASSO regression to reconstruct complex ecDNA and deconvolve co-occurring\ + \ ecDNA elements with overlapping genomic footprints from long-read nanopore sequencing.\ + \ Decoil outperforms de-novo assembly methods in simulated long-read sequencing\ + \ data for both, simple and complex ecDNAs. Applying Decoil on whole genome sequencing\ + \ data uncovered different ecDNA topologies and explored ecDNA structure heterogeneity\ + \ in neuroblastoma tumors and cell lines, indicating that this method may improve\ + \ ecDNA structural analyzes in cancer.},\n author = {M{\\u a}d{\\u a}lina Giurgiu\ + \ and Nadine Wittstruck and Elias Rodriguez-Fos and Roc{\\'i}o Chamorro Gonz{\\\ + 'a}lez and Lotte Br{\\\"u}ckner and Annabell Krienelke-Szymansky and Konstantin\ + \ Helmsauer and Anne Hartebrodt and Richard P. Koche and Kerstin Haase and Knut\ + \ Reinert and Anton G. Henssen},\n booktitle = {Decoil: Reconstructing extrachromosomal\ + \ DNA structural heterogeneity from long-read sequencing data},\n journal = {bioRxiv\ + \ - The Preprint Server for Biology},\n month = {November},\n title = {Decoil:\ + \ Reconstructing extrachromosomal DNA structural heterogeneity from long-read\ + \ sequencing data},\n url = {http://publications.imp.fu-berlin.de/3141/},\n year\ + \ = {2023}\n}\n" + book_title: 'Decoil: Reconstructing extrachromosomal DNA structural heterogeneity + from long-read sequencing data' + creators: + - name: + family: Giurgiu + given: "M\u0103d\u0103lina" + honourific: null + lineage: null + - name: + family: Wittstruck + given: Nadine + honourific: null + lineage: null + - name: + family: Rodriguez-Fos + given: Elias + honourific: null + lineage: null + - name: + family: "Chamorro Gonz\xE1lez" + given: "Roc\xEDo" + honourific: null + lineage: null + - name: + family: "Br\xFCckner" + given: Lotte + honourific: null + lineage: null + - name: + family: Krienelke-Szymansky + given: Annabell + honourific: null + lineage: null + - name: + family: Helmsauer + given: Konstantin + honourific: null + lineage: null + - name: + family: Hartebrodt + given: Anne + honourific: null + lineage: null + - name: + family: Koche + given: Richard P. + honourific: null + lineage: null + - name: + family: Haase + given: Kerstin + honourific: null + lineage: null + - name: + family: Reinert + given: Knut + honourific: null + lineage: null + - name: + family: Henssen + given: Anton G. + honourific: null + lineage: null + date: '2023-11-17' + datestamp: '2024-04-18 10:45:00' + dir: disk0/00/00/31/41 + divisions: + - group_algbioinf + eprint_status: archive + eprintid: 3141 + full_text_status: none + id_number: doi:10.1101/2023.11.15.567169 + ispublished: pub + key: fu_mi_publications3141 + lastmod: '2024-04-18 10:45:00' + metadata_visibility: show + official_url: https://doi.org/10.1101/2023.11.15.567169 + publication: bioRxiv - The Preprint Server for Biology + refereed: 'FALSE' + rev_number: 5 + status_changed: '2024-04-18 10:45:00' + subjects: + - G400 + title: 'Decoil: Reconstructing extrachromosomal DNA structural heterogeneity from + long-read sequencing data' + type: article + uri: http://publications.imp.fu-berlin.de/id/eprint/3141 + userid: 132 +- abstract: "The fast growth of public repositories of sequences greatly contributes\ + \ to the success of metagenomics applications. However, they are growing at a\ + \ much faster pace than the resources to use them properly. This challenges current\ + \ methods, which struggle to take full advantage of the massive and fast data\ + \ generation. We propose a generational leap in performance and usability with\ + \ ganon2, a sequence classification method that performs taxonomic binning and\ + \ profiling for metagenomics analysis. It indexes large datasets with a small\ + \ memory footprint, maintaining fast, sensitive, and precise classification results.\ + \ This is possible with the Hierarchical Interleaved Bloom Filter data structure\ + \ paired with minimizers and several other improvements and optimizations. Based\ + \ on the full NCBI RefSeq and its sub-sets, ganon2 indices are on average 50%\ + \ smaller than state-of-the-art methods, providing a great compression rate for\ + \ large and diverse genomic reference sets. Using 16 simulated samples from various\ + \ studies, including the CAMI 1+2 challenge, ganon2 achieved up to 0.17 higher\ + \ median F1-Score in taxonomic binning. In profiling, improvements in the F1-Score\ + \ median are up to 0.32 keeping a balanced L1-norm error in the abundance estimation.\ + \ ganon2\r\nis one of the fastest tools evaluated and enables the use of larger,\ + \ more diverse and up-to-date reference sets in daily microbiome analysis, improving\ + \ the resolution of results. The code is open-source and available with documentation\ + \ at\r\nhttps://github.com/pirovc/ganon" + bibtex: "@article{fu_mi_publications3142,\n abstract = {The fast growth of public\ + \ repositories of sequences greatly contributes to the success of metagenomics\ + \ applications. However, they are growing at a much faster pace than the resources\ + \ to use them properly. This challenges current methods, which struggle to take\ + \ full advantage of the massive and fast data generation. We propose a generational\ + \ leap in performance and usability with ganon2, a sequence classification method\ + \ that performs taxonomic binning and profiling for metagenomics analysis. It\ + \ indexes large datasets with a small memory footprint, maintaining fast, sensitive,\ + \ and precise classification results. This is possible with the Hierarchical Interleaved\ + \ Bloom Filter data structure paired with minimizers and several other improvements\ + \ and optimizations. Based on the full NCBI RefSeq and its sub-sets, ganon2 indices\ + \ are on average 50\\% smaller than state-of-the-art methods, providing a great\ + \ compression rate for large and diverse genomic reference sets. Using 16 simulated\ + \ samples from various studies, including the CAMI 1+2 challenge, ganon2 achieved\ + \ up to 0.17 higher median F1-Score in taxonomic binning. In profiling, improvements\ + \ in the F1-Score median are up to 0.32 keeping a balanced L1-norm error in the\ + \ abundance estimation. ganon2\nis one of the fastest tools evaluated and enables\ + \ the use of larger, more diverse and up-to-date reference sets in daily microbiome\ + \ analysis, improving the resolution of results. The code is open-source and available\ + \ with documentation at\nhttps://github.com/pirovc/ganon},\n author = {Vitor C.\ + \ Piro and Knut Reinert},\n booktitle = {ganon2: up-to-date and scalable metagenomics\ + \ analysis},\n journal = {bioRxiv preprint},\n month = {December},\n title = {ganon2:\ + \ up-to-date and scalable metagenomics\nanalysis},\n url = {http://publications.imp.fu-berlin.de/3142/},\n\ + \ year = {2023}\n}\n" + book_title: 'ganon2: up-to-date and scalable metagenomics analysis' + creators: + - name: + family: Piro + given: Vitor C. + honourific: null + lineage: null + - name: + family: Reinert + given: Knut + honourific: null + lineage: null + date: '2023-12-08' + datestamp: '2024-04-18 10:54:02' + dir: disk0/00/00/31/42 + divisions: + - group_algbioinf + eprint_status: archive + eprintid: 3142 + full_text_status: none + id_number: doi:10.1101/2023.12.07.570547 + ispublished: pub + key: fu_mi_publications3142 + lastmod: '2024-04-18 10:54:02' + metadata_visibility: show + official_url: https://doi.org/10.1101/2023.12.07.570547 + publication: bioRxiv preprint + refereed: 'FALSE' + rev_number: 5 + status_changed: '2024-04-18 10:54:02' + subjects: + - G400 + title: "ganon2: up-to-date and scalable metagenomics\r\nanalysis" + type: article + uri: http://publications.imp.fu-berlin.de/id/eprint/3142 + userid: 132 +- abstract: "Motivation\r\nLocal alignments of query sequences in large databases\ + \ represent a core part of metagenomic studies and facilitate homology search.\ + \ Following the development of NCBI Blast, many applications aimed to provide\ + \ faster and equally sensitive local alignment frameworks. Most applications focus\ + \ on protein alignments, while only few also facilitate DNA-based searches. None\ + \ of the established programs allow searching DNA sequences from bisulfite sequencing\ + \ experiments commonly used for DNA methylation profiling, for which specific\ + \ alignment strategies need to be implemented.\r\n\r\nResults\r\nHere, we introduce\ + \ Lambda3, a new version of the local alignment application Lambda. Lambda3 is\ + \ the first solution that enables the search of protein, nucleotide as well as\ + \ bisulfite-converted nucleotide query sequences. Its protein mode achieves comparable\ + \ performance to that of the highly optimized protein alignment application Diamond,\ + \ while the nucleotide mode consistently outperforms established local nucleotide\ + \ aligners. Combined, Lambda3 presents a universal local alignment framework that\ + \ enables fast and sensitive homology searches for a wide range of use-cases.\r\ + \n\r\nAvailability and implementation\r\nLambda3 is free and open-source software\ + \ publicly available at https://github.com/seqan/lambda/." + bibtex: "@article{fu_mi_publications3143,\n abstract = {Motivation\nLocal alignments\ + \ of query sequences in large databases represent a core part of metagenomic studies\ + \ and facilitate homology search. Following the development of NCBI Blast, many\ + \ applications aimed to provide faster and equally sensitive local alignment frameworks.\ + \ Most applications focus on protein alignments, while only few also facilitate\ + \ DNA-based searches. None of the established programs allow searching DNA sequences\ + \ from bisulfite sequencing experiments commonly used for DNA methylation profiling,\ + \ for which specific alignment strategies need to be implemented.\n\nResults\n\ + Here, we introduce Lambda3, a new version of the local alignment application Lambda.\ + \ Lambda3 is the first solution that enables the search of protein, nucleotide\ + \ as well as bisulfite-converted nucleotide query sequences. Its protein mode\ + \ achieves comparable performance to that of the highly optimized protein alignment\ + \ application Diamond, while the nucleotide mode consistently outperforms established\ + \ local nucleotide aligners. Combined, Lambda3 presents a universal local alignment\ + \ framework that enables fast and sensitive homology searches for a wide range\ + \ of use-cases.\n\nAvailability and implementation\nLambda3 is free and open-source\ + \ software publicly available at https://github.com/seqan/lambda/.},\n author\ + \ = {Hannes Hauswedell and Sara Hetzel and Simon G Gottlieb and Helene Kretzmer\ + \ and Alexander Meissner and Knut Reinert and Lenore Cowen},\n journal = {Bioinformatics},\n\ + \ month = {March},\n number = {3},\n publisher = {Oxford University Press},\n\ + \ title = {Lambda3: homology search for protein, nucleotide, and bisulfite-converted\ + \ sequences},\n url = {http://publications.imp.fu-berlin.de/3143/},\n volume =\ + \ {40},\n year = {2024}\n}\n" + creators: + - name: + family: Hauswedell + given: Hannes + honourific: null + lineage: null + - name: + family: Hetzel + given: Sara + honourific: null + lineage: null + - name: + family: Gottlieb + given: Simon G + honourific: null + lineage: null + - name: + family: Kretzmer + given: Helene + honourific: null + lineage: null + - name: + family: Meissner + given: Alexander + honourific: null + lineage: null + - name: + family: Reinert + given: Knut + honourific: null + lineage: null + - name: + family: Cowen + given: Lenore + honourific: null + lineage: null + date: '2024-03-14' + date_type: published + datestamp: '2024-04-18 11:00:25' + dir: disk0/00/00/31/43 + divisions: + - group_algbioinf + eprint_status: archive + eprintid: 3143 + full_text_status: none + id_number: doi:10.1093/bioinformatics/btae097 + ispublished: pub + issn: 1367-4811 + key: fu_mi_publications3143 + lastmod: '2024-04-18 11:44:50' + metadata_visibility: show + number: 3 + official_url: https://doi.org/10.1093/bioinformatics/btae097 + publication: Bioinformatics + publisher: Oxford University Press + refereed: 'TRUE' + rev_number: 8 + status_changed: '2024-04-18 11:44:50' + subjects: + - G400 + title: 'Lambda3: homology search for protein, nucleotide, and bisulfite-converted + sequences' + type: article + uri: http://publications.imp.fu-berlin.de/id/eprint/3143 + userid: 132 + volume: 40 +- abstract: "Data analysis workflows are popular for sequencing activities in large-scale\ + \ and complex scientific processes. Scheduling approaches attempt to find an appropriate\ + \ assignment of workflow tasks to the computing nodes for minimizing the makespan\ + \ in heterogeneous cluster infrastructures. A common feature of these approaches\ + \ is that they already know the structure of the workflow. However, for many workflows,\ + \ a high degree of parallelization can be achieved by splitting the large input\ + \ data of a single task into chunks and processing them independently. We call\ + \ this problem task granularity, which involves finding an assignment of tasks\ + \ to computing nodes\r\nand simultaneously optimizing the structure of a bag of\ + \ tasks. Accordingly, this paper addresses the problem of task granularity for\ + \ metagenomic workflows. To this end, we first formulated the problem as a mathematical\ + \ model. We then solved the proposed model using the genetic algorithm. To overcome\ + \ the challenge of not knowing the number of tasks, we adjusted the number of\ + \ tasks as a factor of the number of computing nodes. The procedure of increasing\ + \ the number of tasks is performed interactively and evolutionarily. Experimental\ + \ results showed that a desirable makespan value can be achieved after a few steps\ + \ of the increase." + bibtex: "@inproceedings{fu_mi_publications3144,\n abstract = {Data analysis workflows\ + \ are popular for sequencing activities in large-scale and complex scientific\ + \ processes. Scheduling approaches attempt to find an appropriate assignment of\ + \ workflow tasks to the computing nodes for minimizing the makespan in heterogeneous\ + \ cluster infrastructures. A common feature of these approaches is that they already\ + \ know the structure of the workflow. However, for many workflows, a high degree\ + \ of parallelization can be achieved by splitting the large input data of a single\ + \ task into chunks and processing them independently. We call this problem task\ + \ granularity, which involves finding an assignment of tasks to computing nodes\n\ + and simultaneously optimizing the structure of a bag of tasks. Accordingly, this\ + \ paper addresses the problem of task granularity for metagenomic workflows. To\ + \ this end, we first formulated the problem as a mathematical model. We then solved\ + \ the proposed model using the genetic algorithm. To overcome the challenge of\ + \ not knowing the number of tasks, we adjusted the number of tasks as a factor\ + \ of the number of computing nodes. The procedure of increasing the number of\ + \ tasks is performed interactively and evolutionarily. Experimental results showed\ + \ that a desirable makespan value can be achieved after a few steps of the increase.},\n\ + \ author = {Somayeh Mohammadi and Latif PourKarimi and Manuel Zsch{\\\"a}bitz\ + \ and Tristan Aretz and Ninon De Mecquenem and Ulf Leser and Knut Reinert},\n\ + \ booktitle = {EDBT/ICDT 2024 Joint Conference: 8th International workshop on\ + \ Data Analytics solutions for Real-LIfe APplications (DARLI-AP)},\n month = {March},\n\ + \ title = {Optimizing Job/Task Granularity for Metagenomic Workflows in\nHeterogeneous\ + \ Cluster Infrastructures},\n url = {http://publications.imp.fu-berlin.de/3144/},\n\ + \ year = {2024}\n}\n" + creators: + - name: + family: Mohammadi + given: Somayeh + honourific: null + lineage: null + - name: + family: PourKarimi + given: Latif + honourific: null + lineage: null + - name: + family: "Zsch\xE4bitz" + given: Manuel + honourific: null + lineage: null + - name: + family: Aretz + given: Tristan + honourific: null + lineage: null + - name: + family: De Mecquenem + given: Ninon + honourific: null + lineage: null + - name: + family: Leser + given: Ulf + honourific: null + lineage: null + - name: + family: Reinert + given: Knut + honourific: null + lineage: null + date: '2024-03-25' + date_type: published + datestamp: '2024-04-18 11:34:05' + dir: disk0/00/00/31/44 + divisions: + - group_algbioinf + eprint_status: archive + eprintid: 3144 + event_title: 'EDBT/ICDT 2024 Joint Conference: 8th International workshop on Data + Analytics solutions for Real-LIfe APplications (DARLI-AP)' + event_type: workshop + full_text_status: none + ispublished: pub + key: fu_mi_publications3144 + lastmod: '2024-04-18 11:34:05' + metadata_visibility: show + official_url: https://ceur-ws.org/Vol-3651/DARLI-AP-15.pdf + pres_type: paper + refereed: 'FALSE' + related_url: + - url: https://ceur-ws.org/Vol-3651/ + rev_number: 7 + status_changed: '2024-04-18 11:34:05' + subjects: + - G400 + title: "Optimizing Job/Task Granularity for Metagenomic Workflows in\r\nHeterogeneous\ + \ Cluster Infrastructures" + type: conference_item + uri: http://publications.imp.fu-berlin.de/id/eprint/3144 + userid: 132 - bibtex: "@article{fu_mi_publications324,\n journal = {Science},\n keywords = {ASSEMBLY},\n\ \ number = {5461},\n pages = {2185--2195},\n title = {The Genome Sequence of Drosophila\ \ melanogaster},\n url = {http://publications.imp.fu-berlin.de/324/},\n volume\