From e8e9b8adb08a281040a198f217d3505fb4dafd99 Mon Sep 17 00:00:00 2001 From: cl117 Date: Sat, 6 Jul 2024 20:08:40 -0600 Subject: [PATCH 01/26] update dependencies --- flask/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flask/requirements.txt b/flask/requirements.txt index 41a7f8e..7b90b71 100644 --- a/flask/requirements.txt +++ b/flask/requirements.txt @@ -7,11 +7,11 @@ Flask==1.0.2 idna==2.7 ipaddress==1.0.22 itsdangerous==0.24 -Jinja2==2.10 -MarkupSafe==1.0 -numpy==1.14.5 +Jinja2 +MarkupSafe==2.0.1 +numpy==1.26.4 python-dateutil==2.7.3 requests==2.19.1 six==1.11.0 urllib3==1.23 -Werkzeug==0.14.1 +Werkzeug==2.1.2 From 74c5cd47ab625743d3bf176ca95bb32144f62018 Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 13 Aug 2024 22:52:27 -0600 Subject: [PATCH 02/26] optimize search.py --- flask/explorer.py | 56 +++- flask/index.py | 1 - flask/requirements.txt | 5 +- flask/search.py | 600 ++++++++++++++--------------------------- 4 files changed, 262 insertions(+), 400 deletions(-) diff --git a/flask/explorer.py b/flask/explorer.py index 7401d7e..ee30121 100644 --- a/flask/explorer.py +++ b/flask/explorer.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -from flask import Flask, request, jsonify, abort +from flask import Flask, request, jsonify, abort, render_template from werkzeug.exceptions import HTTPException import os @@ -15,11 +15,44 @@ import threading import time +from flask_debugtoolbar import DebugToolbarExtension +from flask_debugtoolbar_lineprofilerpanel.profile import line_profile + +def profile_flask_app(): + app.run(debug=True) + +if __name__ == "__main__": + #profiler = profile.Profile() + #profiler.enable() + profile_flask_app() + #profiler.disable() + #profiler.print_stats(sort='time') + log = logging.getLogger('werkzeug') log.setLevel(logging.ERROR) app = Flask(__name__) +app.config['SECRET_KEY'] = 'your-secret-key' # Required for the debug toolbar +app.config['DEBUG'] = True +app.config['DEBUG_TB_INTERCEPT_REDIRECTS'] = False +# Profiler configuration +app.config['DEBUG_TB_PROFILER_ENABLED'] = True +app.config['DEBUG_TB_PANELS'] = [ + 'flask_debugtoolbar.panels.versions.VersionDebugPanel', + 'flask_debugtoolbar.panels.timer.TimerDebugPanel', + 'flask_debugtoolbar.panels.headers.HeaderDebugPanel', + 'flask_debugtoolbar.panels.request_vars.RequestVarsDebugPanel', + 'flask_debugtoolbar.panels.config_vars.ConfigVarsDebugPanel', + 'flask_debugtoolbar.panels.template.TemplateDebugPanel', + 'flask_debugtoolbar.panels.logger.LoggingPanel', + 'flask_debugtoolbar.panels.profiler.ProfilerDebugPanel', + 'flask_debugtoolbar_lineprofilerpanel.panels.LineProfilerPanel' +] + +# Initialize the debug toolbar +toolbar = DebugToolbarExtension(app) + @app.errorhandler(Exception) def handle_error(e): @@ -36,9 +69,9 @@ def startup(): def auto_update_index(): while True: time.sleep(int(utils.get_config()['updateTimeInDays']) * 86400) - if utils.get_config()['autoUpdateIndex'] and utils.get_config()['updateTimeInDays'] > 0: - utils.log('Updating index automatically. To disable, set the \"autoUpdateIndex\" property in config.json to false.') - update() + # if utils.get_config()['autoUpdateIndex'] and utils.get_config()['updateTimeInDays'] > 0: + # utils.log('Updating index automatically. 
To disable, set the \"autoUpdateIndex\" property in config.json to false.') + # update() # Thread for automatically updaing the index periodically update_thread = threading.Thread(target=auto_update_index, daemon=True) @@ -64,7 +97,6 @@ def handle_error(e): utils.log('[ERROR] Returning error ' + str(e) + "\n Traceback:\n" + traceback.format_exc()) return jsonify(error=str(e)), 500 - @app.route('/info', methods=['GET']) def info(): utils.log('Explorer up!!! Virtutoso ' + str(query.memoized_query_sparql.cache_info())) @@ -161,8 +193,13 @@ def incremental_remove_collection(): except: raise +@app.route('/test', methods=['GET']) +@line_profile +def SBOLExplore_test_endpoint(): + return render_template('index.html') @app.route('/', methods=['GET']) +@line_profile def sparql_search_endpoint(): try: # make sure index is built, or throw exception @@ -173,7 +210,13 @@ def sparql_search_endpoint(): if sparql_query is not None: default_graph_uri = request.args.get('default-graph-uri') - response = jsonify(search.search(sparql_query, utils.get_uri2rank(), utils.get_clusters(), default_graph_uri)) + response = jsonify( + search.search( + sparql_query, + utils.get_uri2rank(), + utils.get_clusters(), + default_graph_uri + )) return response else: return "
Welcome to SBOLExplorer! The available indices in Elasticsearch are shown below: "\
                + str(utils.get_es().cat.indices(format='json'))\
                + " The config options are set to: "\
                + str(utils.get_config())\
                + " Visit our GitHub repository!"\
                + " Any issues can be reported to our issue tracker."\
                + "
Used by SynBioHub." + #return render_template('index.html') except: raise diff --git a/flask/index.py b/flask/index.py index 070ee7b..cb38309 100644 --- a/flask/index.py +++ b/flask/index.py @@ -3,7 +3,6 @@ import query import json - def add_pagerank(parts_response, uri2rank): """ Adds the pagerank score for each part diff --git a/flask/requirements.txt b/flask/requirements.txt index 7b90b71..0ebff0e 100644 --- a/flask/requirements.txt +++ b/flask/requirements.txt @@ -9,9 +9,10 @@ ipaddress==1.0.22 itsdangerous==0.24 Jinja2 MarkupSafe==2.0.1 -numpy==1.26.4 +numpy python-dateutil==2.7.3 requests==2.19.1 six==1.11.0 urllib3==1.23 -Werkzeug==2.1.2 +Werkzeug +apscheduler==3.10.4 diff --git a/flask/search.py b/flask/search.py index dd6a272..4d316db 100644 --- a/flask/search.py +++ b/flask/search.py @@ -1,18 +1,27 @@ import re +from typing import List, Dict, Tuple, Optional import utils import query import sequencesearch +# Compile regex patterns +FROM_COUNT_PATTERN = re.compile(r'SELECT \(count\(distinct \?subject\) as \?tempcount\)\s*(.*)\s*WHERE {') +FROM_NORMAL_PATTERN = re.compile(r'\?type\n(.*)\s*WHERE {') +CRITERIA_PATTERN = re.compile(r'WHERE {\s*(.*)\s*\?subject a \?type \.') +OFFSET_PATTERN = re.compile(r'OFFSET (\d+)') +LIMIT_PATTERN = re.compile(r'LIMIT (\d+)') +SEQUENCE_PATTERN = re.compile(r'\s*\?subject sbol2:sequence \?seq \.\s*\?seq sbol2:elements \"([a-zA-Z]*)\"') +FLAG_PATTERN = re.compile(r'# flag_([a-zA-Z0-9._]*): ([a-zA-Z0-9./-_]*)') +KEYWORD_PATTERN = re.compile(r"CONTAINS\(lcase\(\?displayId\), lcase\('([^']*)'\)\)") -def search_es(es_query): + +def extract_offset(sparql_query): + offset_match = OFFSET_PATTERN.search(sparql_query) + return int(offset_match.group(1)) if offset_match else 0 + +def search_es(es_query: str) -> Dict: """ - String query for ES searches - - Arguments: - es_query {string} -- String to search for - - Returns: - List -- List of all search results + String query for ES searches. """ body = { 'query': { @@ -45,26 +54,15 @@ def search_es(es_query): } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise - -def empty_search_es(offset, limit, allowed_graphs): +def empty_search_es(offset: int, limit: int, allowed_graphs: List[str]) -> Dict: """ - Empty string search based solely on pagerank - - Arguments: - offset {int} -- Offset for search results - limit {int} -- Size of search - allowed_graphs {List} -- List of allowed graphs to search on - - Returns: - List -- List of search results + Empty string search based solely on pagerank. 
""" - if len(allowed_graphs) == 1: - query = {'term': {'graph': allowed_graphs[0]}} - else: - query = {'terms': {'graph': allowed_graphs}} + query = {'term': {'graph': allowed_graphs[0]}} if len(allowed_graphs) == 1 else {'terms': {'graph': allowed_graphs}} body = { 'query': { @@ -82,20 +80,13 @@ def empty_search_es(offset, limit, allowed_graphs): } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise - -def search_es_allowed_subjects(es_query, allowed_subjects): +def search_es_allowed_subjects(es_query: str, allowed_subjects: List[str]) -> Dict: """ - String query for ES searches limited to allowed parts - - Arguments: - es_query {string} -- String to search for - allowed_subjects {list} - list of allowed subjects from Virtuoso - - Returns: - List -- List of all search results + String query for ES searches limited to allowed parts. """ body = { 'query': { @@ -107,7 +98,7 @@ def search_es_allowed_subjects(es_query, allowed_subjects): 'query': es_query, 'fields': [ 'subject', - 'displayId^3', # caret indicates displayId is 3 times as important during search + 'displayId^3', 'version', 'name', 'description', @@ -117,37 +108,29 @@ def search_es_allowed_subjects(es_query, allowed_subjects): 'operator': 'or', 'fuzziness': 'AUTO' }}, - {'ids': {'values': list(allowed_subjects)}} + {'ids': {'values': allowed_subjects}} ] } }, 'script_score': { 'script': { - 'source': "_score * Math.log(doc['pagerank'].value + 1)" # Math.log is a natural log + 'source': "_score * Math.log(doc['pagerank'].value + 1)" } - }, - - }, - + } + } }, 'from': 0, 'size': 10000 } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise - -def search_es_allowed_subjects_empty_string(allowed_subjects): +def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]) -> Dict: """ - ES search purely limited to allowed parts - - Arguments: - allowed_subjects {list} - list of allowed subjects from Virtuoso - - Returns: - List -- List of all search results + ES search purely limited to allowed parts. 
""" body = { 'query': { @@ -155,27 +138,57 @@ def search_es_allowed_subjects_empty_string(allowed_subjects): 'query': { 'bool': { 'must': [ - {'ids': {'values': list(allowed_subjects)}} + {'ids': {'values': allowed_subjects}} ] } }, 'script_score': { 'script': { - 'source': "_score * Math.log(doc['pagerank'].value + 1)" # Math.log is a natural log + 'source': "_score * Math.log(doc['pagerank'].value + 1)" } - }, - - }, - + } + } }, 'from': 0, 'size': 10000 } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise +def parse_sparql_query(sparql_query, is_count_query): + # Find FROM clause + _from_search = FROM_COUNT_PATTERN.search(sparql_query) if is_count_query else FROM_NORMAL_PATTERN.search(sparql_query) + _from = _from_search.group(1).strip() if _from_search else '' + + # Find criteria + criteria_search = CRITERIA_PATTERN.search(sparql_query) + criteria = criteria_search.group(1).strip() if criteria_search else '' + + # Find offset + offset_match = OFFSET_PATTERN.search(sparql_query) + offset = int(offset_match.group(1)) if offset_match else 0 + # Find limit + limit_match = LIMIT_PATTERN.search(sparql_query) + limit = int(limit_match.group(1)) if limit_match else 50 + + # Find sequence + sequence_match = SEQUENCE_PATTERN.search(sparql_query) + sequence = sequence_match.group(1) if sequence_match else '' + + # Extract flags + flags = {match.group(1): match.group(2) for match in FLAG_PATTERN.finditer(sparql_query)} + + # Extract keywords + keywords = KEYWORD_PATTERN.findall(criteria) + + # Construct es_query + es_query = ' '.join(keywords).strip() + print("Hello es_query: ", es_query) + + return es_query, _from, criteria, offset, limit, sequence, flags def extract_query(sparql_query): """ @@ -187,225 +200,75 @@ def extract_query(sparql_query): Returns: List -- List of information extracted """ - _from = '' - if is_count_query(sparql_query): - _from_search = re.search(r'''SELECT \(count\(distinct \?subject\) as \?tempcount\)\s*(.*)\s*WHERE {''', - sparql_query) - else: - _from_search = re.search(r'''\?type\n(.*)\s*WHERE {''', sparql_query) - if _from_search: - _from = _from_search.group(1).strip() - - criteria = '' - criteria_search = re.search(r'''WHERE {\s*(.*)\s*\?subject a \?type \.''', sparql_query) - if criteria_search: - criteria = criteria_search.group(1).strip() - - offset = 0 - offset_search = re.search(r'''OFFSET (\d*)''', sparql_query) - if offset_search: - offset = int(offset_search.group(1)) - - limit = 50 - limit_search = re.search(r'''LIMIT (\d*)''', sparql_query) - if limit_search: - limit = int(limit_search.group(1)) - - sequence = '' - sequence_search = re.search(r'''\s*\?subject sbol2:sequence \?seq \.\s*\?seq sbol2:elements \"([a-zA-Z]*)\"''', - sparql_query) - if sequence_search: - sequence = sequence_search.group(1) - - flags = {} - flag_search = re.finditer(r'''# flag_([a-zA-Z0-9._]*): ([a-zA-Z0-9./-_]*)''', sparql_query) - for flag in flag_search: - flags[flag.group(1)] = flag.group(2) - - extract_keyword_re = re.compile(r'''CONTAINS\(lcase\(\?displayId\), lcase\('([^']*)'\)\)''') - keywords = [] - for keyword in re.findall(extract_keyword_re, criteria): - keywords.append(keyword) - es_query = ' '.join(keywords).strip() - - return es_query, _from, criteria, offset, limit, sequence, flags - - -def extract_allowed_graphs(_from, default_graph_uri): - """ - Extracts the allowed graphs to search over + return parse_sparql_query(sparql_query, 
is_count_query(sparql_query)) - Arguments: - _from {string} -- Graph where search originated - default_graph_uri {string} -- The default graph URI pulled from SBH - Returns: - List -- List of allowed graphs +def extract_allowed_graphs(_from: str, default_graph_uri: str) -> List[str]: """ - allowed_graphs = [] - + Extracts the allowed graphs to search over. + """ + allowed_graphs = [default_graph_uri] if not _from else [graph.strip()[1:-1] for graph in _from.split('FROM') if graph.strip()] if utils.get_config()['distributed_search']: - instances = utils.get_wor() - for instance in instances: - allowed_graphs.append(instance['instanceUrl'] + '/public') + allowed_graphs.extend(instance['instanceUrl'] + '/public' for instance in utils.get_wor()) + return allowed_graphs - if _from == '': - allowed_graphs.append(default_graph_uri) - return allowed_graphs - else: - for graph in _from.split('FROM'): - graph = graph.strip() - graph = graph[1:len(graph) - 1] - - if graph != '': - allowed_graphs.append(graph) - - return allowed_graphs - - -def is_count_query(sparql_query): +def is_count_query(sparql_query: str) -> bool: return 'SELECT (count(distinct' in sparql_query - -def create_response(count, bindings, return_count): +def create_response(count: int, bindings: List[Dict], return_count: bool) -> Dict: """ - Creates response to be sent back to SBH - - Arguments: - count {int} -- ? - bindings {Dict} -- The bindings - return_count {int} -- ? - - Returns: - ? -- ? + Creates response to be sent back to SBH. """ if return_count: - response = {"head": - {"link": [], "vars": ["count"]}, - "results": {"distinct": False, "ordered": True, - "bindings": [{"count": { - "type": "typed-literal", - "datatype": "http://www.w3.org/2001/XMLSchema#integer", - "value": "10"}}]}} - response['results']['bindings'][0]['count']['value'] = str(count) - else: - response = {"head": {"link": [], - "vars": ["subject", "displayId", "version", "name", "description", "type", "percentMatch", - "strandAlignment", "CIGAR"]}, - "results": {"distinct": False, "ordered": True, "bindings": []}} - response['results']['bindings'] = bindings - - return response - + return { + "head": {"link": [], "vars": ["count"]}, + "results": { + "distinct": False, + "ordered": True, + "bindings": [{"count": { + "type": "typed-literal", + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "value": str(count) + } + }] + } + } + return { + "head": { + "link": [], + "vars": ["subject", "displayId", "version", "name", "description", "type", "percentMatch", "strandAlignment", "CIGAR"] + }, + "results": {"distinct": False, "ordered": True, "bindings": bindings} + } -def create_binding(subject, displayId, version, name, description, _type, role, sbol_type, order_by, percentMatch=-1, - strandAlignment='N/A', CIGAR='N/A'): +def create_binding(subject: str, displayId: Optional[str], version: Optional[int], name: Optional[str], description: Optional[str], + _type: Optional[str], role: Optional[str], sbol_type: Optional[str], order_by: Optional[float], + percentMatch: float = -1, strandAlignment: str = 'N/A', CIGAR: str = 'N/A') -> Dict: """ - Creates bindings to be sent to SBH - - Arguments: - subject {string} -- URI of part - displayId {string} -- DisplayId of part - version {int} -- Version of part - name {string} -- Name of part - description {string} -- Description of part - _type {string} -- SBOL type of part - role {string} -- S.O. role of part - order_by {?} -- ? 
- - Keyword Arguments: - percentMatch {number} -- Percent match of query part to the target part (default: {-1}) - strandAlignment {str} -- Strand alignment of the query part relatve to the target part (default: {'N/A'}) - CIGAR {str} -- Alignment of query part relative to the target part (default: {'N/A'}) - - Returns: - Dict -- Part and its information + Creates bindings to be sent to SBH. """ binding = {} - - if subject is not None: - binding["subject"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": subject - } - - if displayId is not None: - binding["displayId"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": displayId - } - - if version is not None: - binding["version"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": version - } - - if name is not None: - binding["name"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": name - } - - if description is not None: - binding["description"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": description - } - - if _type is not None: - binding["type"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": _type - } - - if role is not None: - binding["role"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": role - } - - if sbol_type is not None: - binding["sboltype"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": sbol_type - } - - if order_by is not None: - binding["order_by"] = order_by - - if percentMatch != -1: - binding["percentMatch"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": str(percentMatch) - } - - if strandAlignment != 'N/A': - binding["strandAlignment"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": strandAlignment - } - - if CIGAR != 'N/A': - binding["CIGAR"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": CIGAR - } - + attributes = { + "subject": subject, + "displayId": displayId, + "version": str(version) if version is not None else None, + "name": name, + "description": description, + "type": _type, + "role": role, + "sboltype": sbol_type, + "order_by": order_by, + "percentMatch": str(percentMatch) if percentMatch != -1 else None, + "strandAlignment": strandAlignment if strandAlignment != 'N/A' else None, + "CIGAR": CIGAR if CIGAR != 'N/A' else None + } + for key, value in attributes.items(): + if value is not None: + datatype = "http://www.w3.org/2001/XMLSchema#uri" if key in ["subject", "type", "role", "sboltype"] else "http://www.w3.org/2001/XMLSchema#string" + ltype = "uri" if key in ["subject", "type", "role", "sboltype"] else "literal" + binding[key] = {"type": ltype, "value": str(value), "datatype": datatype} if not key=="order_by" else order_by return binding - def create_bindings(es_response, clusters, allowed_graphs, allowed_subjects=None): """ Creates the mass binding consisting of all parts in the search @@ -424,40 +287,43 @@ def create_bindings(es_response, clusters, allowed_graphs, allowed_subjects=None bindings = [] cluster_duplicates = set() + allowed_subjects_set = set(allowed_subjects) if allowed_subjects else None + for hit in es_response['hits']['hits']: _source = hit['_source'] _score = hit['_score'] subject = _source['subject'] - if allowed_subjects is 
not None and subject not in allowed_subjects: + if allowed_subjects_set and subject not in allowed_subjects_set: continue - if _source.get('graph') not in allowed_graphs: + graph = _source.get('graph') + if graph not in allowed_graphs: continue if subject in cluster_duplicates: - _score = _score / 2.0 + _score /= 2.0 elif subject in clusters: cluster_duplicates.update(clusters[subject]) - if _source.get('type') is not None and 'http://sbols.org/v2#Sequence' in _source.get('type'): - _score = _score / 10.0 - - binding = create_binding(subject, - _source.get('displayId'), - _source.get('version'), - _source.get('name'), - _source.get('description'), - _source.get('type'), - _source.get('role'), - _source.get('sboltype'), - _score - ) + if 'http://sbols.org/v2#Sequence' in _source.get('type', ''): + _score /= 10.0 + + binding = create_binding( + subject, + _source.get('displayId'), + _source.get('version'), + _source.get('name'), + _source.get('description'), + _source.get('type'), + _source.get('role'), + _source.get('sboltype'), + _score + ) bindings.append(binding) return bindings - def create_criteria_bindings(criteria_response, uri2rank, sequence_search=False, ucTableName=''): """ Creates binding for all non-string or non-empty searches @@ -474,47 +340,45 @@ def create_criteria_bindings(criteria_response, uri2rank, sequence_search=False, Dict -- Binding of parts """ bindings = [] - parts = (p for p in criteria_response if p.get('role') is None or 'http://wiki.synbiohub.org' in p.get('role')) - for part in parts: + for part in criteria_response: subject = part.get('subject') + pagerank = uri2rank.get(subject, 1) - if subject not in uri2rank: - pagerank = 1 - else: - pagerank = uri2rank[subject] - - if part.get('type') is not None and 'http://sbols.org/v2#Sequence' in part.get('type'): - pagerank = pagerank / 10.0 + if 'http://sbols.org/v2#Sequence' in part.get('type', ''): + pagerank /= 10.0 if sequence_search: - pagerank = pagerank * (float(get_percent_match(part.get('subject'), ucTableName)) / 100) - binding = create_binding(part.get('subject'), - part.get('displayId'), - part.get('version'), - part.get('name'), - part.get('description'), - part.get('type'), - part.get('role'), - part.get('sboltype'), - pagerank, - get_percent_match(part.get('subject'), ucTableName), - get_strand_alignment(part.get('subject'), ucTableName), - get_cigar_data(part.get('subject'), ucTableName)) - + percent_match = float(get_percent_match(subject, ucTableName)) / 100 + binding = create_binding( + subject, + part.get('displayId'), + part.get('version'), + part.get('name'), + part.get('description'), + part.get('type'), + part.get('role'), + part.get('sboltype'), + pagerank * percent_match, + percent_match, + get_strand_alignment(subject, ucTableName), + get_cigar_data(subject, ucTableName) + ) else: - binding = create_binding(part.get('subject'), - part.get('displayId'), - part.get('version'), - part.get('name'), - part.get('description'), - part.get('type'), - part.get('role'), - part.get('sboltype'), - pagerank) + binding = create_binding( + subject, + part.get('displayId'), + part.get('version'), + part.get('name'), + part.get('description'), + part.get('type'), + part.get('role'), + part.get('sboltype'), + pagerank + ) bindings.append(binding) - return bindings + return bindings def get_allowed_subjects(criteria_response): """ @@ -525,15 +389,10 @@ def get_allowed_subjects(criteria_response): Returns: Parts the user is allowed to see """ - subjects = set() - - for part in criteria_response: - 
subjects.add(part['subject']) - - return subjects - + return {part['subject'] for part in criteria_response} def create_similar_criteria(criteria, clusters): + """ Adds filter to query to be sent to Virtuoso Args: @@ -548,7 +407,8 @@ def create_similar_criteria(criteria, clusters): if subject not in clusters or not clusters[subject]: return 'FILTER (?subject != ?subject)' - return 'FILTER (' + ' || '.join(['?subject = <' + duplicate + '>' for duplicate in clusters[subject]]) + ')' + filters = ' || '.join(f'?subject = <{duplicate}>' for duplicate in clusters[subject]) + return f'FILTER ({filters})' def create_sequence_criteria(criteria, uris): @@ -561,10 +421,10 @@ def create_sequence_criteria(criteria, uris): Returns: String containing a SPARQL filter """ - if len(uris) == 0: + if not uris: return '' - - return 'FILTER (' + ' || '.join(['?subject = <' + uri + '>' for uri in uris]) + ')' + filters = ' || '.join(f'?subject = <{uri}>' for uri in uris) + return f'FILTER ({filters})' def parse_allowed_graphs(allowed_graphs): @@ -576,12 +436,7 @@ def parse_allowed_graphs(allowed_graphs): Returns: List of allowed graphs """ - result = '' - for allowed_graph in allowed_graphs: - if allowed_graph is not None: - result += 'FROM <' + allowed_graph + '> ' - return result - + return ' '.join(f'FROM <{graph}>' for graph in allowed_graphs if graph) def search(sparql_query, uri2rank, clusters, default_graph_uri): """ @@ -596,7 +451,7 @@ def search(sparql_query, uri2rank, clusters, default_graph_uri): """ es_query, _from, criteria, offset, limit, sequence, flags = extract_query(sparql_query) - + if criteria.strip() == 'FILTER ()': criteria = '' @@ -610,17 +465,16 @@ def search(sparql_query, uri2rank, clusters, default_graph_uri): allowed_uris = filter_sequence_search_subjects(_from, results) criteria_response = query.query_parts(_from) # Filter searches by URI to hide private parts here instead of on Virtuoso - criteria_response_filtered = [c for c in criteria_response if any(f for f in allowed_uris if f in c.get('subject'))] + criteria_response_filtered = [c for c in criteria_response if any(f in c.get('subject', '') for f in allowed_uris)] bindings = create_criteria_bindings(criteria_response_filtered, uri2rank, True, filename[:-4] + '.uc') - elif len(sequence.strip()) > 0: + elif sequence.strip(): # send sequence search to search.py temp_filename = sequencesearch.write_to_temp(sequence) results = sequencesearch.sequence_search(flags, temp_filename) - allowed_uris = filter_sequence_search_subjects(_from, results) criteria_response = query.query_parts(_from) - criteria_response_filtered = [c for c in criteria_response if any(f for f in allowed_uris if f in c.get('subject'))] + criteria_response_filtered = [c for c in criteria_response if any(f in c.get('subject', '') for f in allowed_uris)] bindings = create_criteria_bindings(criteria_response_filtered, uri2rank, True, temp_filename[:-4] + '.uc') elif 'SIMILAR' in criteria: @@ -634,37 +488,40 @@ def search(sparql_query, uri2rank, clusters, default_graph_uri): criteria_response = query.query_parts(_from, criteria) bindings = create_criteria_bindings(criteria_response, uri2rank) - elif es_query == '' and filterless_criteria == '': + elif es_query == '' and not filterless_criteria: # empty search es_response = empty_search_es(offset, limit, allowed_graphs) bindings = create_bindings(es_response, clusters, allowed_graphs) - bindings.sort(key=lambda binding: binding['order_by'], reverse=True) + bindings.sort(key=lambda b: b['order_by'], reverse=True) return 
create_response(es_response['hits']['total'], bindings, is_count_query(sparql_query)) else: - - if filterless_criteria == '': + if not filterless_criteria: es_response = search_es(es_query) # pure string search bindings = create_bindings(es_response, clusters, allowed_graphs) - else: # advanced search and string search criteria_response = query.query_parts(_from, filterless_criteria) allowed_subjects = get_allowed_subjects(criteria_response) - if es_query == '': - es_allowed_subject = search_es_allowed_subjects_empty_string(allowed_subjects) - else: - es_allowed_subject = search_es_allowed_subjects(es_query, allowed_subjects) + es_allowed_subject = (search_es_allowed_subjects_empty_string(allowed_subjects) + if es_query == '' + else search_es_allowed_subjects(es_query, allowed_subjects)) bindings = create_bindings(es_allowed_subject, clusters, allowed_graphs, allowed_subjects) utils.log('Advanced string search complete.') - bindings.sort(key=lambda binding: binding['order_by'], reverse=True) - + bindings.sort(key=lambda b: b['order_by'], reverse=True) return create_response(len(bindings), bindings[offset:offset + limit], is_count_query(sparql_query)) +def get_info_from_uc_table(uri, ucTableName, column_index): + with open(ucTableName, 'r') as file: + for line in file: + parts = line.split() + if parts[9] == uri: + return parts[column_index] + return 'N/A' def get_percent_match(uri, ucTableName): """ @@ -676,16 +533,7 @@ def get_percent_match(uri, ucTableName): Returns: Percent match if available, else -1 """ - with open(ucTableName, 'r') as read: - uc_reader = read.read() - lines = uc_reader.splitlines() - - for line in lines: - line = line.split() - if line[9] == uri: - return line[3] - - return -1 + return get_info_from_uc_table(uri, ucTableName, 3) def get_strand_alignment(uri, ucTableName): @@ -698,38 +546,10 @@ def get_strand_alignment(uri, ucTableName): Returns: + or - """ - with open(ucTableName, 'r') as read: - uc_reader = read.read() - lines = uc_reader.splitlines() - - for line in lines: - line = line.split() - if line[9] == uri: - return line[4] - - return 'N/A' - + return get_info_from_uc_table(uri, ucTableName, 4) def get_cigar_data(uri, ucTableName): - """ - Gets the CIGAR data of a part (see https://genome.sph.umich.edu/wiki/SAM) - Args: - uri: URI of the part - ucTableName: UClust table - - Returns: CIGAR data if found, or N/A - - """ - with open(ucTableName, 'r') as read: - uc_reader = read.read() - lines = uc_reader.splitlines() - - for line in lines: - line = line.split() - if line[9] == uri: - return line[7] - - return 'N/A' + return get_info_from_uc_table(uri, ucTableName, 7) def filter_sequence_search_subjects(_from, uris): """ @@ -741,9 +561,7 @@ def filter_sequence_search_subjects(_from, uris): _from {list} -- List of allowed graphs uris {list} -- List of URI's from sequence search """ - from_uris = [] - result = re.findall(r"\<([A-Za-z0-9:\/.]+)\>*", _from) - for r in result: - from_uris.append(r) - - return [uri for uri in uris if any(f for f in from_uris if f in uri)] + from_uris = set(re.findall(r"\<([A-Za-z0-9:\/.]+)\>*", _from)) + return [uri for uri in uris if any(f in uri for f in from_uris)] + + From db907e64b23432f9f4b8c636472fa3dbaa065230 Mon Sep 17 00:00:00 2001 From: cl117 Date: Thu, 15 Aug 2024 11:17:27 -0600 Subject: [PATCH 03/26] optimize explorer.py --- flask/explorer.py | 223 +++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 120 deletions(-) diff --git a/flask/explorer.py b/flask/explorer.py index ee30121..c10f8e9 
100644 --- a/flask/explorer.py +++ b/flask/explorer.py @@ -2,10 +2,14 @@ from flask import Flask, request, jsonify, abort, render_template from werkzeug.exceptions import HTTPException - import os import traceback import logging +import threading +import time +from flask_debugtoolbar import DebugToolbarExtension +from flask_debugtoolbar_lineprofilerpanel.profile import line_profile + import cluster import pagerank import index @@ -13,93 +17,95 @@ import utils import query -import threading -import time -from flask_debugtoolbar import DebugToolbarExtension -from flask_debugtoolbar_lineprofilerpanel.profile import line_profile - - -def profile_flask_app(): - app.run(debug=True) - -if __name__ == "__main__": - #profiler = profile.Profile() - #profiler.enable() - profile_flask_app() - #profiler.disable() - #profiler.print_stats(sort='time') - +# Configure logging, This will affect all loggers in your application, not just the Werkzeug logger. log = logging.getLogger('werkzeug') log.setLevel(logging.ERROR) app = Flask(__name__) -app.config['SECRET_KEY'] = 'your-secret-key' # Required for the debug toolbar -app.config['DEBUG'] = True -app.config['DEBUG_TB_INTERCEPT_REDIRECTS'] = False -# Profiler configuration -app.config['DEBUG_TB_PROFILER_ENABLED'] = True -app.config['DEBUG_TB_PANELS'] = [ - 'flask_debugtoolbar.panels.versions.VersionDebugPanel', - 'flask_debugtoolbar.panels.timer.TimerDebugPanel', - 'flask_debugtoolbar.panels.headers.HeaderDebugPanel', - 'flask_debugtoolbar.panels.request_vars.RequestVarsDebugPanel', - 'flask_debugtoolbar.panels.config_vars.ConfigVarsDebugPanel', - 'flask_debugtoolbar.panels.template.TemplateDebugPanel', - 'flask_debugtoolbar.panels.logger.LoggingPanel', - 'flask_debugtoolbar.panels.profiler.ProfilerDebugPanel', - 'flask_debugtoolbar_lineprofilerpanel.panels.LineProfilerPanel' -] +app.config.update( + SECRET_KEY='your-secret-key', # Required for the debug toolbar + DEBUG=True, + DEBUG_TB_INTERCEPT_REDIRECTS=False, + DEBUG_TB_PROFILER_ENABLED=True, + DEBUG_TB_PANELS=[ + 'flask_debugtoolbar.panels.versions.VersionDebugPanel', + 'flask_debugtoolbar.panels.timer.TimerDebugPanel', + 'flask_debugtoolbar.panels.headers.HeaderDebugPanel', + 'flask_debugtoolbar.panels.request_vars.RequestVarsDebugPanel', + 'flask_debugtoolbar.panels.config_vars.ConfigVarsDebugPanel', + 'flask_debugtoolbar.panels.template.TemplateDebugPanel', + 'flask_debugtoolbar.panels.logger.LoggingPanel', + 'flask_debugtoolbar.panels.profiler.ProfilerDebugPanel', + 'flask_debugtoolbar_lineprofilerpanel.panels.LineProfilerPanel' + ] +) # Initialize the debug toolbar toolbar = DebugToolbarExtension(app) - +# Error handler @app.errorhandler(Exception) def handle_error(e): - utils.log('[ERROR] Returning error ' + str(e) + "\n Traceback:\n" + traceback.format_exc()) - + log.error(f'[ERROR] Returning error {e}\n Traceback:\n{traceback.format_exc()}') if isinstance(e, HTTPException): return jsonify(error=str(e.name + ": " + e.description)), e.code - else: - return jsonify(error=str(type(e).__name__) + str(e)), 500 + return jsonify(error=str(type(e).__name__) + str(e)), 500 @app.before_first_request def startup(): - # Method for running auto indexing def auto_update_index(): + update_interval = int(utils.get_config().get('updateTimeInDays', 0)) * 86400 while True: - time.sleep(int(utils.get_config()['updateTimeInDays']) * 86400) - # if utils.get_config()['autoUpdateIndex'] and utils.get_config()['updateTimeInDays'] > 0: - # utils.log('Updating index automatically. 
To disable, set the \"autoUpdateIndex\" property in config.json to false.') - # update() + time.sleep(update_interval) + # Implement your update logic here + if utils.get_config().get('autoUpdateIndex', False): + update_index() - # Thread for automatically updaing the index periodically + # Start the background thread for auto-updating the index update_thread = threading.Thread(target=auto_update_index, daemon=True) update_thread.start() - if os.path.exists('log.txt') and os.path.getsize('log.txt') > 20000000: # Delete the log if it is > 20 MB - os.remove('log.txt') - - if os.path.exists('indexing_log.txt') and os.path.getsize('indexing_log.txt') > 20000000: # Delete the log if it is > 20 MB - os.remove('indexing_log.txt') + # Manage log file sizes + for log_file in ['log.txt', 'indexing_log.txt']: + if os.path.exists(log_file) and os.path.getsize(log_file) > 20000000: # 20 MB + os.remove(log_file) utils.log('SBOLExplorer started :)') + # Check and create index if necessary try: - if utils.get_es().indices.exists(index=utils.get_config()['elasticsearch_index_name']) is False: + es = utils.get_es() + index_name = utils.get_config().get('elasticsearch_index_name') + if not es.indices.exists(index=index_name): utils.log('Index not found, creating new index.') - update() - except: + update_index() + except Exception as e: + log.error(f'Error during startup: {e}') raise -@app.errorhandler(Exception) -def handle_error(e): - utils.log('[ERROR] Returning error ' + str(e) + "\n Traceback:\n" + traceback.format_exc()) - return jsonify(error=str(e)), 500 +def update_index(): + utils.log_indexing('============ STARTING INDEXING ============\n\n') + utils.log('============ STARTING INDEXING ============\n\n') + utils.save_update_start_time() + + clusters = cluster.update_clusters() + utils.save_clusters(clusters) + + uri2rank = pagerank.update_pagerank() + utils.save_uri2rank(uri2rank) + + index.update_index(utils.get_uri2rank()) + + query.memoized_query_sparql.cache_clear() + utils.log_indexing('Cache cleared') + + utils.save_update_end_time() + utils.log_indexing('============ INDEXING COMPLETED ============\n\n') + utils.log('============ INDEXING COMPLETED ============\n\n') @app.route('/info', methods=['GET']) def info(): - utils.log('Explorer up!!! Virtutoso ' + str(query.memoized_query_sparql.cache_info())) + utils.log('Explorer up!!! 
Virtuoso ' + str(query.memoized_query_sparql.cache_info())) return utils.get_log() @app.route('/indexinginfo', methods=['GET']) @@ -107,76 +113,51 @@ def indexinginfo(): return utils.get_indexing_log() @app.route('/config', methods=['POST', 'GET']) -def config(): +def config_route(): if request.method == 'POST': new_config = request.get_json() utils.set_config(new_config) utils.log('Successfully updated config') - config = utils.get_config() - return jsonify(config) - + return jsonify(utils.get_config()) @app.route('/update', methods=['GET']) def update(): try: subject = request.args.get('subject') - - if subject is None: - utils.log_indexing('============ STARTING INDEXING ============\n\n') - utils.log('============ STARTING INDEXING ============\n\n') - utils.save_update_start_time() - - clusters = cluster.update_clusters() - utils.save_clusters(clusters) - - - uri2rank = pagerank.update_pagerank() - utils.save_uri2rank(uri2rank) - - index.update_index(utils.get_uri2rank()) - - query.memoized_query_sparql.cache_clear() - utils.log_indexing('Cache cleared') - - utils.save_update_end_time() - success_message = 'Successfully updated entire index' - else: + if subject: index.refresh_index(subject, utils.get_uri2rank()) - success_message = 'Successfully refreshed: ' + subject - - utils.log_indexing('============ INDEXING COMPLETED ============\n\n') - utils.log('============ INDEXING COMPLETED ============\n\n') + success_message = f'Successfully refreshed: {subject}' + else: + update_index() + success_message = 'Successfully updated entire index' return success_message except Exception as e: - utils.log_indexing('[ERROR] Returning error ' + str(e) + "\n Traceback:\n" + traceback.format_exc()) - + log.error(f'Error during update: {e}') + raise @app.route('/incrementalupdate', methods=['POST']) def incremental_update(): try: updates = request.get_json() - index.incremental_update(updates, utils.get_uri2rank()) - success_message = 'Successfully incrementally updated parts' utils.log(success_message) - return - except: + return success_message + except Exception as e: + log.error(f'Error during incremental update: {e}') raise - @app.route('/incrementalremove', methods=['GET']) def incremental_remove(): try: subject = request.args.get('subject') - index.incremental_remove(subject) - - success_message = 'Successfully incrementally removed: ' + subject + success_message = f'Successfully incrementally removed: {subject}' utils.log(success_message) return success_message - except: + except Exception as e: + log.error(f'Error during incremental remove: {e}') raise @app.route('/incrementalremovecollection', methods=['GET']) @@ -184,13 +165,12 @@ def incremental_remove_collection(): try: subject = request.args.get('subject') uri_prefix = request.args.get('uriPrefix') - index.incremental_remove_collection(subject, uri_prefix) - - success_message = 'Successfully incrementally removed collection and members: ' + subject + success_message = f'Successfully incrementally removed collection and members: {subject}' utils.log(success_message) return success_message - except: + except Exception as e: + log.error(f'Error during incremental remove collection: {e}') raise @app.route('/test', methods=['GET']) @@ -202,44 +182,47 @@ def SBOLExplore_test_endpoint(): @line_profile def sparql_search_endpoint(): try: - # make sure index is built, or throw exception - if utils.get_es().indices.exists(index=utils.get_config()['elasticsearch_index_name']) is False or utils.get_es().cat.indices(format='json')[0]['health'] is 
'red': + es = utils.get_es() + index_name = utils.get_config().get('elasticsearch_index_name') + if not es.indices.exists(index=index_name) or es.cat.indices(format='json')[0]['health'] == 'red': abort(503, 'Elasticsearch is not working or the index does not exist.') sparql_query = request.args.get('query') - - if sparql_query is not None: + if sparql_query: default_graph_uri = request.args.get('default-graph-uri') - response = jsonify( - search.search( - sparql_query, - utils.get_uri2rank(), - utils.get_clusters(), - default_graph_uri - )) + response = jsonify(search.search( + sparql_query, + utils.get_uri2rank(), + utils.get_clusters(), + default_graph_uri + )) return response - else: - return "
Welcome to SBOLExplorer! The available indices in Elasticsearch are shown below: "\
+        return "Welcome to SBOLExplorer! The available indices in Elasticsearch are shown below: "\
             + str(utils.get_es().cat.indices(format='json'))\
             + " The config options are set to: "\
             + str(utils.get_config())\
             + " Visit our GitHub repository!"\
             + " Any issues can be reported to our issue tracker."\
             + "
Used by SynBioHub." - #return render_template('index.html') - except: + + except Exception as e: + log.error(f'Error during SPARQL search: {e}') raise @app.route('/search', methods=['GET']) def search_by_string(): try: - if utils.get_es().indices.exists(index=utils.get_config()['elasticsearch_index_name']) is False or utils.get_es().cat.indices(format='json')[0]['health'] is 'red': + es = utils.get_es() + index_name = utils.get_config().get('elasticsearch_index_name') + if not es.indices.exists(index=index_name) or es.cat.indices(format='json')[0]['health'] == 'red': abort(503, 'Elasticsearch is not working or the index does not exist.') query = request.args.get('query') - response = jsonify(search.search_es(query)['hits']) - return response - except: + except Exception as e: + log.error(f'Error during search by string: {e}') raise + +if __name__ == "__main__": + app.run(debug=True) From e4b79f36b4f59819ec98a29d57b55db02c53ae67 Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 27 Aug 2024 11:05:09 -0600 Subject: [PATCH 04/26] split utils into 5 classes --- flask/cluster.py | 38 +++--- flask/configManager.py | 63 +++++++++ flask/dataManager.py | 76 +++++++++++ flask/elasticsearchManager.py | 17 +++ flask/explorer.py | 86 ++++++------ flask/index.py | 48 ++++--- flask/logger.py | 39 ++++++ flask/pagerank.py | 27 ++-- flask/query.py | 31 +++-- flask/search.py | 32 +++-- flask/sequencesearch.py | 14 +- flask/utils.py | 249 ---------------------------------- flask/wor_client.py | 19 +++ 13 files changed, 369 insertions(+), 370 deletions(-) create mode 100644 flask/configManager.py create mode 100644 flask/dataManager.py create mode 100644 flask/elasticsearchManager.py create mode 100644 flask/logger.py delete mode 100644 flask/utils.py create mode 100644 flask/wor_client.py diff --git a/flask/cluster.py b/flask/cluster.py index 02da210..967a346 100644 --- a/flask/cluster.py +++ b/flask/cluster.py @@ -1,19 +1,21 @@ from xml.etree import ElementTree import subprocess -import utils +from configManager import ConfigManager +from logger import Logger import query from sys import platform - -uclust_identity = utils.get_config()['uclust_identity'] # how similar sequences in the same cluster must be +config_manager = ConfigManager() +uclust_identity = config_manager.load_config()['uclust_identity'] # how similar sequences in the same cluster must be +logger_ = Logger() sequences_filename = 'dumps/sequences.fsa' -if 'which_search' not in utils.get_config(): - explorerConfig = utils.get_config() +if 'which_search' not in config_manager.load_config(): + explorerConfig = config_manager.load_config() explorerConfig['which_search'] = 'vsearch' - utils.set_config(explorerConfig) + config_manager.load_config(explorerConfig) -whichSearch = utils.get_config()['which_search'] +whichSearch = config_manager.load_config()['which_search'] if platform == "linux" or platform == "linux2": if whichSearch == 'usearch': @@ -26,7 +28,7 @@ elif whichSearch == 'vsearch': usearch_binary_filename = 'usearch/vsearch_macos' else: - utils.log("Sorry, your OS is not supported for sequence based-search.") + logger_.log("Sorry, your OS is not supported for sequence based-search.") uclust_results_filename = 'usearch/uclust_results.uc' @@ -56,7 +58,7 @@ def run_uclust(): popen = subprocess.Popen(args, stdout=subprocess.PIPE) popen.wait() output = popen.stdout.read() - utils.log_indexing(str(output)) + logger_.log(str(output), True) def analyze_uclust(): @@ -80,11 +82,11 @@ def analyze_uclust(): hits += 1 f.close() - 
utils.log_indexing('parts: ' + str(total_parts)) - utils.log_indexing('hits: ' + str(hits)) + logger_.log('parts: ' + str(total_parts), True) + logger_.log('hits: ' + str(hits), True) if hits > 0: - utils.log_indexing('average hit identity: ' + str(total_identity / hits)) + logger_.log('average hit identity: ' + str(total_identity / hits), True) def uclust2uris(fileName): @@ -138,17 +140,17 @@ def uclust2clusters(): def update_clusters(): - utils.log_indexing('------------ Updating clusters ------------') - utils.log_indexing('******** Query for sequences ********') + logger_.log('------------ Updating clusters ------------', True) + logger_.log('******** Query for sequences ********', True) sequences_response = query.query_sparql(sequence_query) - utils.log_indexing('******** Query for sequences complete ********') + logger_.log('******** Query for sequences complete ********', True) write_fasta(sequences_response) - utils.log_indexing('******** Running uclust ********') + logger_.log('******** Running uclust ********', True) run_uclust() - utils.log_indexing('******** Running uclust complete ********') + logger_.log('******** Running uclust complete ********', True) analyze_uclust() - utils.log_indexing('------------ Successsfully updated clusters ------------\n') + logger_.log('------------ Successsfully updated clusters ------------\n', True) return uclust2clusters() diff --git a/flask/configManager.py b/flask/configManager.py new file mode 100644 index 0000000..096ff41 --- /dev/null +++ b/flask/configManager.py @@ -0,0 +1,63 @@ +import json +import datetime + +class ConfigManager: + def __init__(self, config_file='config.json'): + self.config_file = config_file + self._config = None + + def load_config(self): + """ + Gets a copy of the config file + Returns: Config file in JSON + + """ + if self._config is None: + with open(self.config_file) as f: + self._config = json.load(f) + return self._config + + def save_config(self, new_config): + """ + Overwrites the existing config with a new config file + Args: + new_config: New config file with the updated information + Returns: + """ + config = self.load_config() + config.update(new_config) + with open(self.config_file, 'w') as f: + json.dump(config, f) + + def save_time(self, attribute): + """ + Saves the current time to an attribute in the config + Args: + attribute: Config attribute to save current time to + + Returns: + + """ + config = self.load_config() + config[attribute] = datetime.datetime.now().isoformat() + self.save_config(config) + + def get_es_endpoint(self): + return self.load_config().get('elasticsearch_endpoint') + + def save_update_end_time(self): + """ + Save end time of indexing + Returns: + + """ + return self.save_time("last_update_end") + + + def save_update_start_time(self): + """ + Save start time of indexing + Returns: + + """ + return self.save_time("last_update_start") diff --git a/flask/dataManager.py b/flask/dataManager.py new file mode 100644 index 0000000..82e50e0 --- /dev/null +++ b/flask/dataManager.py @@ -0,0 +1,76 @@ +import pickle +import os +class DataManager: + def __init__(self, clusters_filename='dumps/clusters_dump', uri2rank_filename='dumps/uri2rank_dump'): + self.clusters_filename = clusters_filename + self.uri2rank_filename = uri2rank_filename + self._clusters = None + self._uri2rank = None + + def save_clusters(self, clusters): + """ + Save clusters of parts + Args: + new_clusters: Clusters to be saved + + Returns: + + """ + self._clusters = clusters + self._serialize(self._clusters, 
self.clusters_filename) + + def get_clusters(self): + if self._clusters is None: + self._clusters = self._deserialize(self.clusters_filename) + return self._clusters + + def save_uri2rank(self, uri2rank): + """ + Saves the pagerank of all URI's + Args: + new_uri2rank: + + Returns: + + """ + self._uri2rank = uri2rank + self._serialize(self._uri2rank, self.uri2rank_filename) + + def get_uri2rank(self): + """ + Gets all pageranks of URI's + Returns: + + """ + if self._uri2rank is None: + self._uri2rank = self._deserialize(self.uri2rank_filename) + return self._uri2rank + + @staticmethod + def _serialize(data, filename): + """ + Serializes some data to a file + Args: + data: Data to be written + filename: File to be written to + + Returns: + + """ + with open(filename, 'wb') as f: + pickle.dump(data, f) + + @staticmethod + def _deserialize(filename): + """ + Deserializes data from a serialized file + Args: + filename: Serialized file + + Returns: Deserialized data from file + + """ + if os.path.exists(filename): + with open(filename, 'rb') as f: + return pickle.load(f) + return {} diff --git a/flask/elasticsearchManager.py b/flask/elasticsearchManager.py new file mode 100644 index 0000000..985dc94 --- /dev/null +++ b/flask/elasticsearchManager.py @@ -0,0 +1,17 @@ +from elasticsearch import Elasticsearch + +class ElasticsearchManager: + def __init__(self, config_manager): + self.config_manager = config_manager + self._es = None + + def get_es(self): + """ + Gets an instance of elasticsearch + Returns: The instance of elasticsearch + """ + if self._es is None: + self._es = Elasticsearch([self.config_manager.get_es_endpoint()], verify_certs=True) + if not self._es.ping(): + raise ValueError('Elasticsearch connection failed') + return self._es \ No newline at end of file diff --git a/flask/explorer.py b/flask/explorer.py index c10f8e9..93ae178 100644 --- a/flask/explorer.py +++ b/flask/explorer.py @@ -7,20 +7,28 @@ import logging import threading import time -from flask_debugtoolbar import DebugToolbarExtension -from flask_debugtoolbar_lineprofilerpanel.profile import line_profile - import cluster import pagerank import index import search -import utils import query +from configManager import ConfigManager +from dataManager import DataManager +from elasticsearchManager import ElasticsearchManager +from logger import Logger + +from flask_debugtoolbar import DebugToolbarExtension +from flask_debugtoolbar_lineprofilerpanel.profile import line_profile # Configure logging, This will affect all loggers in your application, not just the Werkzeug logger. 
log = logging.getLogger('werkzeug') log.setLevel(logging.ERROR) +config_manager = ConfigManager() +data_manager = DataManager() +elasticsearch_manager = ElasticsearchManager(config_manager) +logger_ = Logger() + app = Flask(__name__) app.config.update( SECRET_KEY='your-secret-key', # Required for the debug toolbar @@ -54,11 +62,11 @@ def handle_error(e): @app.before_first_request def startup(): def auto_update_index(): - update_interval = int(utils.get_config().get('updateTimeInDays', 0)) * 86400 + update_interval = int(config_manager.load_config().get('updateTimeInDays', 0)) * 86400 while True: time.sleep(update_interval) # Implement your update logic here - if utils.get_config().get('autoUpdateIndex', False): + if config_manager.load_config().get('autoUpdateIndex', False): update_index() # Start the background thread for auto-updating the index @@ -70,63 +78,61 @@ def auto_update_index(): if os.path.exists(log_file) and os.path.getsize(log_file) > 20000000: # 20 MB os.remove(log_file) - utils.log('SBOLExplorer started :)') + logger_.log('SBOLExplorer started :)') # Check and create index if necessary try: - es = utils.get_es() - index_name = utils.get_config().get('elasticsearch_index_name') + es = elasticsearch_manager.get_es() + index_name = config_manager.load_config().get('elasticsearch_index_name') if not es.indices.exists(index=index_name): - utils.log('Index not found, creating new index.') + logger_.log('Index not found, creating new index.') update_index() except Exception as e: log.error(f'Error during startup: {e}') raise def update_index(): - utils.log_indexing('============ STARTING INDEXING ============\n\n') - utils.log('============ STARTING INDEXING ============\n\n') - utils.save_update_start_time() + logger_.log('============ STARTING INDEXING ============\n\n', True) + config_manager.save_update_start_time() clusters = cluster.update_clusters() - utils.save_clusters(clusters) + data_manager.save_clusters(clusters) uri2rank = pagerank.update_pagerank() - utils.save_uri2rank(uri2rank) + data_manager.save_uri2rank(uri2rank) - index.update_index(utils.get_uri2rank()) + index.update_index(data_manager.get_uri2rank()) query.memoized_query_sparql.cache_clear() - utils.log_indexing('Cache cleared') + logger_.log('Cache cleared', True) - utils.save_update_end_time() - utils.log_indexing('============ INDEXING COMPLETED ============\n\n') - utils.log('============ INDEXING COMPLETED ============\n\n') + config_manager.save_update_end_time() + logger_.log('============ INDEXING COMPLETED ============\n\n', True) @app.route('/info', methods=['GET']) def info(): - utils.log('Explorer up!!! Virtuoso ' + str(query.memoized_query_sparql.cache_info())) - return utils.get_log() + logger_.log('Explorer up!!! 
Virtuoso ' + str(query.memoized_query_sparql.cache_info())) + return logger_.get_log() @app.route('/indexinginfo', methods=['GET']) def indexinginfo(): - return utils.get_indexing_log() + return logger_.get_indexing_log() @app.route('/config', methods=['POST', 'GET']) def config_route(): if request.method == 'POST': new_config = request.get_json() - utils.set_config(new_config) - utils.log('Successfully updated config') + config_manager.save_config(new_config) + logger_.log('Successfully updated config') - return jsonify(utils.get_config()) + return jsonify(config_manager.load_config()) @app.route('/update', methods=['GET']) def update(): try: subject = request.args.get('subject') if subject: - index.refresh_index(subject, utils.get_uri2rank()) + index.refresh_index(subject, data_manager.get_uri2rank()) success_message = f'Successfully refreshed: {subject}' else: update_index() @@ -140,9 +146,9 @@ def update(): def incremental_update(): try: updates = request.get_json() - index.incremental_update(updates, utils.get_uri2rank()) + index.incremental_update(updates, data_manager.get_uri2rank()) success_message = 'Successfully incrementally updated parts' - utils.log(success_message) + logger_.log(success_message) return success_message except Exception as e: log.error(f'Error during incremental update: {e}') @@ -154,7 +160,7 @@ def incremental_remove(): subject = request.args.get('subject') index.incremental_remove(subject) success_message = f'Successfully incrementally removed: {subject}' - utils.log(success_message) + logger_.log(success_message) return success_message except Exception as e: log.error(f'Error during incremental remove: {e}') @@ -167,7 +173,7 @@ def incremental_remove_collection(): uri_prefix = request.args.get('uriPrefix') index.incremental_remove_collection(subject, uri_prefix) success_message = f'Successfully incrementally removed collection and members: {subject}' - utils.log(success_message) + logger_.log(success_message) return success_message except Exception as e: log.error(f'Error during incremental remove collection: {e}') @@ -182,8 +188,8 @@ def SBOLExplore_test_endpoint(): @line_profile def sparql_search_endpoint(): try: - es = utils.get_es() - index_name = utils.get_config().get('elasticsearch_index_name') + es = elasticsearch_manager.get_es() + index_name = config_manager.load_config().get('elasticsearch_index_name') if not es.indices.exists(index=index_name) or es.cat.indices(format='json')[0]['health'] == 'red': abort(503, 'Elasticsearch is not working or the index does not exist.') @@ -192,15 +198,15 @@ def sparql_search_endpoint(): default_graph_uri = request.args.get('default-graph-uri') response = jsonify(search.search( sparql_query, - utils.get_uri2rank(), - utils.get_clusters(), + data_manager.get_uri2rank(), + data_manager.get_clusters(), default_graph_uri )) return response return "
Welcome to SBOLExplorer! The available indices in Elasticsearch are shown below: "\
-            + str(utils.get_es().cat.indices(format='json'))\
+            + str(elasticsearch_manager.get_es().cat.indices(format='json'))\
             + " The config options are set to: "\
-            + str(utils.get_config())\
+            + str(config_manager.load_config())\
             + " Visit our GitHub repository!"\
             + " Any issues can be reported to our issue tracker."\
             + "
Used by SynBioHub." @@ -212,8 +218,8 @@ def sparql_search_endpoint(): @app.route('/search', methods=['GET']) def search_by_string(): try: - es = utils.get_es() - index_name = utils.get_config().get('elasticsearch_index_name') + es = elasticsearch_manager.get_es() + index_name = config_manager.load_config().get('elasticsearch_index_name') if not es.indices.exists(index=index_name) or es.cat.indices(format='json')[0]['health'] == 'red': abort(503, 'Elasticsearch is not working or the index does not exist.') @@ -225,4 +231,4 @@ def search_by_string(): raise if __name__ == "__main__": - app.run(debug=True) + app.run(debug=True) # threaded=True diff --git a/flask/index.py b/flask/index.py index cb38309..0762a72 100644 --- a/flask/index.py +++ b/flask/index.py @@ -1,7 +1,13 @@ from elasticsearch import helpers -import utils +from configManager import ConfigManager +from elasticsearchManager import ElasticsearchManager import query import json +from logger import Logger + +config_manager = ConfigManager() +elasticsearch_manager = ElasticsearchManager(config_manager) +logger_ = Logger() def add_pagerank(parts_response, uri2rank): """ @@ -94,9 +100,9 @@ def create_parts_index(index_name): index_name {String} -- Name of the new index """ - if utils.get_es().indices.exists(index_name): - utils.log_indexing('Index already exists -> deleting') - utils.get_es().indices.delete(index=index_name) + if elasticsearch_manager.get_es().indices.exists(index_name): + logger_.log('Index already exists -> deleting', True) + elasticsearch_manager.get_es().indices.delete(index=index_name) body = { 'mappings': { @@ -116,8 +122,8 @@ def create_parts_index(index_name): } } - utils.get_es().indices.create(index=index_name, body=body) - utils.log_indexing('Index created') + elasticsearch_manager.get_es().indices.create(index=index_name, body=body) + logger_.log('Index created', True) def bulk_index_parts(parts_response, index_name): @@ -143,12 +149,12 @@ def bulk_index_parts(parts_response, index_name): actions.append(action) - utils.log_indexing('Bulk indexing') + logger_.log('Bulk indexing', True) try: - stats = helpers.bulk(utils.get_es(), actions) - utils.log_indexing('Bulk indexing complete') + stats = helpers.bulk(elasticsearch_manager.get_es(), actions) + logger_.log('Bulk indexing complete', True) except: - utils.log_indexing('[ERROR] Error_messages: ' + '\n'.join(stats[1])) + logger_.log('[ERROR] Error_messages: ' + '\n'.join(stats[1]), True) raise Exception("Bulk indexing failed") def update_index(uri2rank): @@ -160,15 +166,15 @@ def update_index(uri2rank): Returns: """ - index_name = utils.get_config()['elasticsearch_index_name'] + index_name = config_manager.load_config()['elasticsearch_index_name'] - utils.log_indexing('------------ Updating index ------------') + logger_.log('------------ Updating index ------------', True) - utils.log_indexing('******** Query for parts ********') + logger_.log('******** Query for parts ********', True) parts_response = query.query_parts(indexing = True) - utils.log_indexing('******** Query for parts complete ********') + logger_.log('******** Query for parts complete ********', True) - utils.log_indexing('******** Adding parts to new index ********') + logger_.log('******** Adding parts to new index ********', True) add_pagerank(parts_response, uri2rank) add_keywords(parts_response) add_roles(parts_response) @@ -176,9 +182,9 @@ def update_index(uri2rank): create_parts_index(index_name) bulk_index_parts(parts_response, index_name) - utils.log_indexing('******** Finished 
adding ' + str(len(parts_response)) + ' parts to index ********') + logger_.log('******** Finished adding ' + str(len(parts_response)) + ' parts to index ********', True) - utils.log_indexing('------------ Successfully updated index ------------\n') + logger_.log('------------ Successfully updated index ------------\n', True) def delete_subject(subject): @@ -190,7 +196,7 @@ def delete_subject(subject): Returns: """ - index_name = utils.get_config()['elasticsearch_index_name'] + index_name = config_manager.load_config()['elasticsearch_index_name'] body = { 'query': { @@ -202,13 +208,13 @@ def delete_subject(subject): }, 'conflicts': 'proceed' } - utils.get_es().delete_by_query(index=index_name, doc_type=index_name, body=body) + elasticsearch_manager.get_es().delete_by_query(index=index_name, doc_type=index_name, body=body) def index_part(part): delete_subject(part['subject']) - index_name = utils.get_config()['elasticsearch_index_name'] - utils.get_es().index(index=index_name, doc_type=index_name, id=part['subject'], body=part) + index_name = config_manager.load_config()['elasticsearch_index_name'] + elasticsearch_manager.get_es().index(index=index_name, doc_type=index_name, id=part['subject'], body=part) def refresh_index(subject, uri2rank): diff --git a/flask/logger.py b/flask/logger.py new file mode 100644 index 0000000..00c7259 --- /dev/null +++ b/flask/logger.py @@ -0,0 +1,39 @@ +import datetime +import os +class Logger: + def __init__(self, log_file='log.txt', indexing_log_file='indexing_log.txt'): + self.log_file = log_file + self.indexing_log_file = indexing_log_file + + def log(self, message, to_indexing_log=False): + """ + Writes a message to the log + Args: + message: Message to write + + Returns: + """ + log_message = f'[{datetime.datetime.now().isoformat()}] {message}\n' + print(log_message, end='') # Output to console + + file = self.indexing_log_file if to_indexing_log else self.log_file + with open(file, 'a+') as f: + f.write(log_message) + + def get_log(self): + """ + Gets a copy of the log + Returns: Stream from the read() method + + """ + return self._read_file(self.log_file) + + def get_indexing_log(self): + return self._read_file(self.indexing_log_file) + + @staticmethod + def _read_file(filename): + if os.path.exists(filename): + with open(filename, 'r') as f: + return f.read() + return "" diff --git a/flask/pagerank.py b/flask/pagerank.py index 816c9dd..2b319f1 100644 --- a/flask/pagerank.py +++ b/flask/pagerank.py @@ -1,8 +1,11 @@ from xml.etree import ElementTree import numpy as np -import utils import query +from logger import Logger +from configManager import ConfigManager +config_manager = ConfigManager() +logger_ = Logger() link_query = ''' SELECT DISTINCT ?parent ?child @@ -116,7 +119,7 @@ def pagerank(g, s=0.85, tolerance=0.001): if n == 0: - utils.log_indexing('no iterations: empty graph') + logger_.log('no iterations: empty graph', True) return p iteration = 1 @@ -134,7 +137,7 @@ def pagerank(g, s=0.85, tolerance=0.001): new_p = v / np.sum(v) delta = np.sum(np.abs(p - new_p)) - utils.log_indexing('Iteration ' + str(iteration) + ': L1 norm delta is ' + str(delta)) + logger_.log('Iteration ' + str(iteration) + ': L1 norm delta is ' + str(delta), True) p = new_p iteration += 1 @@ -155,22 +158,22 @@ def make_uri2rank(pr_vector, uri2index): def update_pagerank(): - utils.log_indexing('------------ Updating pagerank ------------') - utils.log_indexing('******** Query for uris ********') + logger_.log('------------ Updating pagerank ------------', True) + 
logger_.log('******** Query for uris ********', True) uri_response = query.query_sparql(uri_query) - utils.log_indexing('******** Query for uris complete ********') + logger_.log('******** Query for uris complete ********', True) adjacency_list = populate_uris(uri_response) - utils.log_indexing('******** Query for links ********') + logger_.log('******** Query for links ********', True) link_response = query.query_sparql(link_query) - utils.log_indexing('******** Query for links complete ********') + logger_.log('******** Query for links complete ********', True) populate_links(link_response, adjacency_list) g = graph(adjacency_list) - utils.log_indexing('******** Running pagerank ********') - pr = pagerank(g, tolerance=float(utils.get_config()['pagerank_tolerance'])) - utils.log_indexing('******** Running pagerank complete ********') - utils.log_indexing('------------ Successfully updated pagerank ------------\n') + logger_.log('******** Running pagerank ********', True) + pr = pagerank(g, tolerance=float(config_manager.load_config()['pagerank_tolerance'])) + logger_.log('******** Running pagerank complete ********', True) + logger_.log('------------ Successfully updated pagerank ------------\n', True) pr_vector = np.squeeze(np.asarray(pr)) # after squeeze, make sure it at least has a dimension in the case that there is only one element diff --git a/flask/query.py b/flask/query.py index 75c53d1..e7a54b8 100755 --- a/flask/query.py +++ b/flask/query.py @@ -2,8 +2,15 @@ import urllib.parse from functools import lru_cache import json -import utils +from wor_client import WORClient import re +from configManager import ConfigManager +from logger import Logger + +config_manager = ConfigManager() +logger_ = Logger() +wor_client_ = WORClient() + def query_parts(_from = '', criteria = '', indexing = False): @@ -67,10 +74,10 @@ def query_sparql(query): Returns: """ - endpoints = [utils.get_config()['sparql_endpoint']] + endpoints = [config_manager.load_config()['sparql_endpoint']] - if utils.get_config()['distributed_search']: - instances = utils.get_wor() + if config_manager.load_config()['distributed_search']: + instances = wor_client_.get_wor_instance() for instance in instances: endpoints.append(instance['instanceUrl'] + '/sparql?') @@ -80,7 +87,7 @@ def query_sparql(query): try: results.extend(page_query(query, endpoint)) except: - utils.log('[ERROR] failed querying:' + endpoint) + logger_.log('[ERROR] failed querying:' + endpoint) raise Exception("Endpoint not responding") return deduplicate_results(results) @@ -113,7 +120,7 @@ def page_query(query, endpoint): Returns: List of parts """ - utils.log('Current endpoint: ' + endpoint) + logger_.log('Current endpoint: ' + endpoint) bar = [ "[ ]", @@ -136,7 +143,7 @@ def page_query(query, endpoint): ] bar_counter = 0 - if endpoint != utils.get_config()['sparql_endpoint']: + if endpoint != config_manager.load_config()['sparql_endpoint']: query = re.sub(r'''FROM.*\n''', '', query) query_prefix = ''' @@ -187,8 +194,8 @@ def send_query(query, endpoint): """ params = {'query': query} - if endpoint == utils.get_config()['sparql_endpoint']: - params['default-graph-uri'] = '' # utils.get_config()['synbiohub_public_graph'] + if endpoint == config_manager.load_config()['sparql_endpoint']: + params['default-graph-uri'] = '' # config_manager.load_config()['synbiohub_public_graph'] url = endpoint + urllib.parse.urlencode(params) headers = {'Accept': 'application/json'} @@ -196,12 +203,12 @@ def send_query(query, endpoint): try: r = requests.get(url, 
headers=headers) except Exception as e: - utils.log("[ERROR] exception when connecting: " + str(e)) + logger_.log("[ERROR] exception when connecting: " + str(e)) raise Exception("Local SynBioHub isn't responding") if r.status_code != 200: - utils.log('[ERROR] Got status code when querying: ' + str(r.status_code)) - utils.log(r.text) + logger_.log('[ERROR] Got status code when querying: ' + str(r.status_code)) + logger_.log(r.text) raise Exception(url + ' is not responding') results = [] diff --git a/flask/search.py b/flask/search.py index 4d316db..d772b8f 100644 --- a/flask/search.py +++ b/flask/search.py @@ -1,8 +1,16 @@ import re from typing import List, Dict, Tuple, Optional -import utils import query import sequencesearch +from wor_client import WORClient +from elasticsearchManager import ElasticsearchManager +from configManager import ConfigManager +from logger import Logger + +config_manager = ConfigManager() +elasticsearch_manager = ElasticsearchManager(config_manager) +logger_ = Logger() +wor_client_ = WORClient() # Compile regex patterns FROM_COUNT_PATTERN = re.compile(r'SELECT \(count\(distinct \?subject\) as \?tempcount\)\s*(.*)\s*WHERE {') @@ -53,9 +61,9 @@ def search_es(es_query: str) -> Dict: 'size': 10000 } try: - return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) + return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) except Exception as e: - utils.log(f"ES search failed: {e}") + logger_.log(f"ES search failed: {e}") raise def empty_search_es(offset: int, limit: int, allowed_graphs: List[str]) -> Dict: @@ -79,9 +87,9 @@ def empty_search_es(offset: int, limit: int, allowed_graphs: List[str]) -> Dict: 'size': limit } try: - return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) + return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) except Exception as e: - utils.log(f"ES search failed: {e}") + logger_.log(f"ES search failed: {e}") raise def search_es_allowed_subjects(es_query: str, allowed_subjects: List[str]) -> Dict: @@ -123,9 +131,9 @@ def search_es_allowed_subjects(es_query: str, allowed_subjects: List[str]) -> Di 'size': 10000 } try: - return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) + return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) except Exception as e: - utils.log(f"ES search failed: {e}") + logger_.log(f"ES search failed: {e}") raise def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]) -> Dict: @@ -153,9 +161,9 @@ def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]) -> Dict 'size': 10000 } try: - return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) + return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) except Exception as e: - utils.log(f"ES search failed: {e}") + logger_.log(f"ES search failed: {e}") raise def parse_sparql_query(sparql_query, is_count_query): # Find FROM clause @@ -208,8 +216,8 @@ def extract_allowed_graphs(_from: str, default_graph_uri: str) -> List[str]: Extracts the allowed graphs to search over. 
""" allowed_graphs = [default_graph_uri] if not _from else [graph.strip()[1:-1] for graph in _from.split('FROM') if graph.strip()] - if utils.get_config()['distributed_search']: - allowed_graphs.extend(instance['instanceUrl'] + '/public' for instance in utils.get_wor()) + if config_manager.load_config()['distributed_search']: + allowed_graphs.extend(instance['instanceUrl'] + '/public' for instance in wor_client_.get_wor_instance()) return allowed_graphs def is_count_query(sparql_query: str) -> bool: @@ -510,7 +518,7 @@ def search(sparql_query, uri2rank, clusters, default_graph_uri): else search_es_allowed_subjects(es_query, allowed_subjects)) bindings = create_bindings(es_allowed_subject, clusters, allowed_graphs, allowed_subjects) - utils.log('Advanced string search complete.') + logger_.log('Advanced string search complete.') bindings.sort(key=lambda b: b['order_by'], reverse=True) return create_response(len(bindings), bindings[offset:offset + limit], is_count_query(sparql_query)) diff --git a/flask/sequencesearch.py b/flask/sequencesearch.py index 6cb899b..89d6964 100644 --- a/flask/sequencesearch.py +++ b/flask/sequencesearch.py @@ -1,12 +1,14 @@ from xml.etree import ElementTree import subprocess -import utils import query import cluster import search from sys import platform import base64 import tempfile +from logger import Logger + +logger_ = Logger() # handling selection of VSEARCH binary @@ -15,7 +17,7 @@ elif platform == "darwin": vsearch_binary_filename = 'usearch/vsearch_macos' else: - utils.log("Sorry, your OS is not supported for sequence based-search.") + logger_.log("Sorry, your OS is not supported for sequence based-search.") # add valid flags to here globalFlags = {'maxaccepts': '50', 'id': '0.8', 'iddef': '2', 'maxrejects': '0', 'maxseqlength': '5000', 'minseqlength': '20'} @@ -51,7 +53,7 @@ def run_vsearch_global(fileName): popen = subprocess.Popen(args, stdout=subprocess.PIPE) popen.wait() output = popen.stdout.read() - utils.log(output) + logger_.log(output) def run_vsearch_exact(fileName): """ @@ -66,7 +68,7 @@ def run_vsearch_exact(fileName): popen = subprocess.Popen(args, stdout=subprocess.PIPE) popen.wait() output = popen.stdout.read() - utils.log(output) + logger_.log(output) def append_flags_to_args(argsList, flags): @@ -124,7 +126,7 @@ def sequence_search(userFlags, fileName): Returns: set -- search results by URI """ - utils.log('Starting sequence search') + logger_.log('Starting sequence search') if "search_exact" in userFlags: add_exact_flags(userFlags) @@ -132,7 +134,7 @@ def sequence_search(userFlags, fileName): else: add_global_flags(userFlags) run_vsearch_global(fileName) - utils.log('Sequence search complete') + logger_.log('Sequence search complete') return cluster.uclust2uris(fileName[:-4] + '.uc') diff --git a/flask/utils.py b/flask/utils.py deleted file mode 100644 index db0efc8..0000000 --- a/flask/utils.py +++ /dev/null @@ -1,249 +0,0 @@ -from elasticsearch import Elasticsearch -import json -import pickle -import requests -import datetime -import os - -config = None - -def get_config(): - """ - Gets a copy of the config file - Returns: Config file in JSON - - """ - global config - - if not config: - with open('config.json') as f: - config = json.load(f) - - return config - - -def set_config(new_config): - """ - Overwrites the existing config with a new config file - Args: - new_config: New config file with the updated information - - Returns: - - """ - global config - - config = get_config() - - for key in new_config: - if key in config: - 
config[key] = new_config[key] - - with open('config.json', 'w') as f: - json.dump(config, f) - - -def save_time(attribute): - """ - Saves the current time to an attribute in the config - Args: - attribute: Config attribute to save current time to - - Returns: - - """ - config = get_config() - - now = datetime.datetime.now() - - config[attribute] = str(now) - - set_config(config) - -def save_update_end_time(): - """ - Save end time of indexing - Returns: - - """ - save_time("last_update_end") - - -def save_update_start_time(): - """ - Save start time of indexing - Returns: - - """ - save_time("last_update_start") - - -def get_wor(): - """ - Gets all instances of SynBioHub from the Web of Registries - Returns: - - """ - try: - instances = requests.get('https://wor.synbiohub.org/instances/') - except Exception: - log('[ERROR] Web of Registries had a problem!') - return [] - - if instances.status_code != 200: - log('[ERROR] Web of Registries had a problem!') - return [] - - return instances.json() - - -def get_es(): - """ - Gets an instance of elasticsearch - Returns: The instance of elasticsearch - - """ - es = Elasticsearch([get_config()['elasticsearch_endpoint']], verify_certs=True) - - if not es.ping(): - raise ValueError('Elasticsearch connection failed') - - return es - - -def log(message): - """ - Writes a message to the log - Args: - message: Message to write - - Returns: - - """ - log_message = '[' + str(datetime.datetime.now()) + '] ' + str(message) + '\n' - print(log_message) - - with open('log.txt', 'a+') as f: - f.write(log_message) - -def log_indexing(message): - log_message = '[' + str(datetime.datetime.now()) + '] ' + str(message) + '\n' - print(log_message) - - with open('indexing_log.txt', 'a+') as f: - f.write(log_message) - -def get_log(): - """ - Gets a copy of the log - Returns: Stream from the read() method - - """ - try: - with open('log.txt', 'r') as f: - return f.read() - except: - return "" - -def get_indexing_log(): - try: - with open('indexing_log.txt', 'r') as f: - return f.read() - except: - return "" - -clusters = None -clusters_filename = 'dumps/clusters_dump' - -uri2rank = None -uri2rank_filename = 'dumps/uri2rank_dump' - - -def save_clusters(new_clusters): - """ - Save clusters of parts - Args: - new_clusters: Clusters to be saved - - Returns: - - """ - global clusters - clusters = new_clusters - serialize(clusters, clusters_filename) - - -def get_clusters(): - """ - Gets all clusters of parts - Returns: - - """ - global clusters - - if clusters is None: - clusters = deserialize(clusters_filename) - - return clusters - - -def save_uri2rank(new_uri2rank): - """ - Saves the pagerank of all URI's - Args: - new_uri2rank: - - Returns: - - """ - global uri2rank - uri2rank = new_uri2rank - serialize(uri2rank, uri2rank_filename) - - -def get_uri2rank(): - """ - Gets all pageranks of URI's - Returns: - - """ - global uri2rank - - if uri2rank is None: - uri2rank = deserialize(uri2rank_filename) - - return uri2rank - - -def serialize(data, filename): - """ - Serializes some data to a file - Args: - data: Data to be written - filename: File to be written to - - Returns: - - """ - f = open(filename, 'wb') - pickle.dump(data, f) - f.close() - - -def deserialize(filename): - """ - Deserializes data from a serialized file - Args: - filename: Serialized file - - Returns: Deserialized data from file - - """ - if not os.path.exists(filename): - return {} - - f = open(filename, 'rb') - data = pickle.load(f) - f.close() - return data - \ No newline at end of file diff --git 
a/flask/wor_client.py b/flask/wor_client.py new file mode 100644 index 0000000..3f0f607 --- /dev/null +++ b/flask/wor_client.py @@ -0,0 +1,19 @@ +import requests +from logger import Logger + +logger_ = Logger() +class WORClient: + @staticmethod + def get_wor_instances(): + """ + Gets all instances of SynBioHub from the Web of Registries + Returns: + + """ + try: + response = requests.get('https://wor.synbiohub.org/instances/') + response.raise_for_status() + return response.json() + except requests.RequestException: + logger_.log('[ERROR] Web of Registries had a problem!') + return [] \ No newline at end of file From 5605f3e507de93467ec1a164e86853e5320e6e52 Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 27 Aug 2024 12:04:44 -0600 Subject: [PATCH 05/26] optimize cluster.py --- flask/cluster.py | 139 ++++++++++++++++------------------------------- 1 file changed, 47 insertions(+), 92 deletions(-) diff --git a/flask/cluster.py b/flask/cluster.py index 967a346..97332ce 100644 --- a/flask/cluster.py +++ b/flask/cluster.py @@ -6,29 +6,27 @@ from sys import platform config_manager = ConfigManager() -uclust_identity = config_manager.load_config()['uclust_identity'] # how similar sequences in the same cluster must be +config = config_manager.load_config() # Load config once +uclust_identity = config['uclust_identity'] # Get the uclust identity value logger_ = Logger() sequences_filename = 'dumps/sequences.fsa' -if 'which_search' not in config_manager.load_config(): - explorerConfig = config_manager.load_config() - explorerConfig['which_search'] = 'vsearch' - config_manager.load_config(explorerConfig) +# Ensure 'which_search' is set in config +if 'which_search' not in config: + config['which_search'] = 'vsearch' + config_manager.save_config(config) -whichSearch = config_manager.load_config()['which_search'] +whichSearch = config['which_search'] -if platform == "linux" or platform == "linux2": - if whichSearch == 'usearch': - usearch_binary_filename = 'usearch/usearch10.0.240_i86linux32' - elif whichSearch == 'vsearch': - usearch_binary_filename = 'usearch/vsearch_linux' +# Determine the correct binary filename based on OS and search tool +usearch_binary_filename = None +if platform.startswith("linux"): + usearch_binary_filename = 'usearch/vsearch_linux' if whichSearch == 'vsearch' else 'usearch/usearch10.0.240_i86linux32' elif platform == "darwin": - if whichSearch == 'usearch': - usearch_binary_filename = 'usearch/usearch11.0.667_i86osx32' - elif whichSearch == 'vsearch': - usearch_binary_filename = 'usearch/vsearch_macos' + usearch_binary_filename = 'usearch/vsearch_macos' if whichSearch == 'vsearch' else 'usearch/usearch11.0.667_i86osx32' else: - logger_.log("Sorry, your OS is not supported for sequence based-search.") + logger_.log("Sorry, your OS is not supported for sequence-based search.") + raise SystemExit uclust_results_filename = 'usearch/uclust_results.uc' @@ -42,103 +40,61 @@ } ''' - def write_fasta(sequences): - f = open(sequences_filename, 'w') - - for sequence in sequences: - f.write('>%s\n' % sequence['subject']) - f.write('%s\n' % sequence['sequence']) - - f.close() - + with open(sequences_filename, 'w') as f: + for sequence in sequences: + f.write(f">{sequence['subject']}\n{sequence['sequence']}\n") def run_uclust(): args = [usearch_binary_filename, '-cluster_fast', sequences_filename, '-id', uclust_identity, '-sort', 'length', '-uc', uclust_results_filename] - popen = subprocess.Popen(args, stdout=subprocess.PIPE) - popen.wait() - output = popen.stdout.read() - 
logger_.log(str(output), True) - + result = subprocess.run(args, capture_output=True, text=True) + logger_.log(result.stdout, True) def analyze_uclust(): - f = open(uclust_results_filename, 'r') - results = f.read() - total_parts = 0 total_identity = 0.0 hits = 0 - lines = results.splitlines() - for line in lines: - line = line.split() - record_type = line[0] - - if record_type in ('H', 'S'): - total_parts += 1 - - if line[0] is 'H': - total_identity += float(line[3]) - hits += 1 - - f.close() - logger_.log('parts: ' + str(total_parts), True) - logger_.log('hits: ' + str(hits), True) - + with open(uclust_results_filename, 'r') as f: + for line in f: + parts = line.split() + record_type = parts[0] + if record_type in ('H', 'S'): + total_parts += 1 + if record_type == 'H': + total_identity += float(parts[3]) + hits += 1 + + logger_.log(f'parts: {total_parts}', True) + logger_.log(f'hits: {hits}', True) if hits > 0: - logger_.log('average hit identity: ' + str(total_identity / hits), True) - + logger_.log(f'average hit identity: {total_identity / hits}', True) def uclust2uris(fileName): uris = set() - - f = open(fileName, 'r') - results = f.read() - lines = results.splitlines() - - for line in lines: - line = line.split() - - if line[0] is 'H': - partURI = line[9] - - uris.add(partURI) - - f.close() - + with open(fileName, 'r') as f: + for line in f: + parts = line.split() + if parts[0] == 'H': + uris.add(parts[9]) return uris def uclust2clusters(): - # populate cluster2parts cluster2parts = {} - f = open(uclust_results_filename, 'r') - results = f.read() - lines = results.splitlines() - - for line in lines: - line = line.split() - - if line[0] is 'H' or line[0] is 'S': - part, cluster = line[8], line[1] + with open(uclust_results_filename, 'r') as f: + for line in f: + parts = line.split() + if parts[0] in ('H', 'S'): + part, cluster = parts[8], parts[1] + if cluster not in cluster2parts: + cluster2parts[cluster] = set() + cluster2parts[cluster].add(part) - if cluster not in cluster2parts: - cluster2parts[cluster] = set() - cluster2parts[cluster].add(part) - - f.close() - - # transform cluster2parts to clusters - clusters = {} - - for cluster in cluster2parts: - parts = cluster2parts[cluster] - for part in parts: - clusters[part] = parts.difference({part}) + clusters = {part: parts.difference({part}) for cluster, parts in cluster2parts.items() for part in parts} return clusters - def update_clusters(): logger_.log('------------ Updating clusters ------------', True) logger_.log('******** Query for sequences ********', True) @@ -151,6 +107,5 @@ def update_clusters(): logger_.log('******** Running uclust complete ********', True) analyze_uclust() - logger_.log('------------ Successsfully updated clusters ------------\n', True) + logger_.log('------------ Successfully updated clusters ------------\n', True) return uclust2clusters() - From 0418a64789ae8a8b090a52fb8ca0d23eb9b4fa28 Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 27 Aug 2024 14:52:04 -0600 Subject: [PATCH 06/26] optimize query.py --- flask/query.py | 150 +++++++++++++++++-------------------------------- 1 file changed, 53 insertions(+), 97 deletions(-) mode change 100755 => 100644 flask/query.py diff --git a/flask/query.py b/flask/query.py old mode 100755 new mode 100644 index e7a54b8..9f00e1b --- a/flask/query.py +++ b/flask/query.py @@ -1,30 +1,29 @@ import requests import urllib.parse from functools import lru_cache -import json from wor_client import WORClient import re from configManager import ConfigManager from logger import 
Logger +# Load config once and reuse config_manager = ConfigManager() +config = config_manager.load_config() + logger_ = Logger() wor_client_ = WORClient() - - -def query_parts(_from = '', criteria = '', indexing = False): +def query_parts(_from='', criteria='', indexing=False): """ - Gets all parts from Virtuoso + Gets all parts from Virtuoso. Args: _from: Graph the parts are from criteria: Any additional criteria indexing: Whether this query is being called during indexing Returns: Formatted list of all parts from Virtuoso - """ - query = ''' + query = f''' SELECT DISTINCT ?subject ?displayId @@ -35,117 +34,85 @@ def query_parts(_from = '', criteria = '', indexing = False): ?graph ?role ?sboltype - ''' + _from + ''' - WHERE { - ''' + criteria + ''' + {_from} + WHERE {{ + {criteria} ?subject a ?type . - ?subject sbh:topLevel ?subject .''' + ('''\n GRAPH ?graph { ?subject ?a ?t } .''' if indexing else "") + ''' - OPTIONAL { ?subject sbol2:displayId ?displayId . } - OPTIONAL { ?subject sbol2:version ?version . } - OPTIONAL { ?subject dcterms:title ?name . } - OPTIONAL { ?subject dcterms:description ?description . } - OPTIONAL { ?subject sbol2:role ?role . } - OPTIONAL { ?subject sbol2:type ?sboltype . } - } + ?subject sbh:topLevel ?subject . + {("GRAPH ?graph { ?subject ?a ?t } ." if indexing else "")} + OPTIONAL {{ ?subject sbol2:displayId ?displayId . }} + OPTIONAL {{ ?subject sbol2:version ?version . }} + OPTIONAL {{ ?subject dcterms:title ?name . }} + OPTIONAL {{ ?subject dcterms:description ?description . }} + OPTIONAL {{ ?subject sbol2:role ?role . }} + OPTIONAL {{ ?subject sbol2:type ?sboltype . }} + }} ''' - return memoized_query_sparql(query) - @lru_cache(maxsize=32) def memoized_query_sparql(query): """ - Speeds up SPARQL queries using a LRU cache + Speeds up SPARQL queries using a LRU cache. Args: query: SPARQL Query Returns: Results of the SPARQL query - """ return query_sparql(query) - def query_sparql(query): """ - Query instances of Virtuoso + Query instances of Virtuoso. Args: query: SPARQL query - Returns: - + Returns: Deduplicated results of the SPARQL query """ - endpoints = [config_manager.load_config()['sparql_endpoint']] + endpoints = [config['sparql_endpoint']] - if config_manager.load_config()['distributed_search']: + if config.get('distributed_search'): instances = wor_client_.get_wor_instance() - for instance in instances: - endpoints.append(instance['instanceUrl'] + '/sparql?') + endpoints.extend(instance['instanceUrl'] + '/sparql?' for instance in instances) results = [] for endpoint in endpoints: try: results.extend(page_query(query, endpoint)) - except: - logger_.log('[ERROR] failed querying:' + endpoint) - raise Exception("Endpoint not responding") + except Exception as e: + logger_.log(f'[ERROR] failed querying: {endpoint} - {str(e)}') + continue return deduplicate_results(results) - def deduplicate_results(results): """ - Removes duplicates from all SPARQL queries to various Virtuoso instances + Removes duplicates from all SPARQL queries to various Virtuoso instances. 
Args: results: List of results which may contain duplicates Returns: Deduplicated list of results - """ - deduped = set() - + seen = set() + deduped = [] for result in results: - deduped.add(json.dumps(result, sort_keys=True)) - - return [json.loads(result) for result in deduped] - + result_tuple = tuple(sorted(result.items())) + if result_tuple not in seen: + seen.add(result_tuple) + deduped.append(result) + return deduped def page_query(query, endpoint): """ - Query to get results from a particular page in SynBioHub + Query to get results from a particular page in SynBioHub. Args: query: Query to run endpoint: Virtuoso endpoint to hit Returns: List of parts - """ - logger_.log('Current endpoint: ' + endpoint) - - bar = [ - "[ ]", - "[= ]", - "[=== ]", - "[==== ]", - "[===== ]", - "[====== ]", - "[======= ]", - "[========]", - "[ =======]", - "[ ======]", - "[ =====]", - "[ ====]", - "[ ===]", - "[ ==]", - "[ =]", - "[ ]", - "[ ]" - ] - bar_counter = 0 - - if endpoint != config_manager.load_config()['sparql_endpoint']: - query = re.sub(r'''FROM.*\n''', '', query) - + logger_.log(f'Current endpoint: {endpoint}') query_prefix = ''' PREFIX rdf: PREFIX dcterms: @@ -163,61 +130,50 @@ def page_query(query, endpoint): offset = 0 limit = 10000 - results = [] - while True: - print(bar[bar_counter % len(bar)], end="\r") - bar_counter+= 1 + if endpoint != config['sparql_endpoint']: + query = re.sub(r'''FROM.*\n''', '', query) - full_query = query_prefix + query + 'OFFSET ' + str(offset) + ' LIMIT ' + str(limit) + while True: + full_query = f"{query_prefix} {query} OFFSET {offset} LIMIT {limit}" new_results = send_query(full_query, endpoint) results.extend(new_results) - if len(new_results) != limit: + if len(new_results) < limit: break offset += limit return results - def send_query(query, endpoint): """ - Sends a query to Virtuoso + Sends a query to Virtuoso. 
Args: query: Query to be sent endpoint: Endpoint where Virtuoso resides Returns: List of parts from Virtuoso - """ params = {'query': query} - if endpoint == config_manager.load_config()['sparql_endpoint']: - params['default-graph-uri'] = '' # config_manager.load_config()['synbiohub_public_graph'] + if endpoint == config['sparql_endpoint']: + params['default-graph-uri'] = '' # Modify this if needed - url = endpoint + urllib.parse.urlencode(params) + url = f"{endpoint}{urllib.parse.urlencode(params)}" headers = {'Accept': 'application/json'} try: r = requests.get(url, headers=headers) - except Exception as e: - logger_.log("[ERROR] exception when connecting: " + str(e)) + r.raise_for_status() # Raises an error for bad HTTP responses + except requests.RequestException as e: + logger_.log(f"[ERROR] exception when connecting: {str(e)}") raise Exception("Local SynBioHub isn't responding") - if r.status_code != 200: - logger_.log('[ERROR] Got status code when querying: ' + str(r.status_code)) - logger_.log(r.text) - raise Exception(url + ' is not responding') - - results = [] - - for binding in r.json()['results']['bindings']: - result = {} - for key in binding: - result[key] = binding[key]['value'] - results.append(result) + results = [ + {key: binding[key]['value'] for key in binding} + for binding in r.json()['results']['bindings'] + ] return results - From ccad037d53c92c4be4b3b3c88544571f7ac2458d Mon Sep 17 00:00:00 2001 From: cl117 Date: Wed, 28 Aug 2024 16:04:40 -0600 Subject: [PATCH 07/26] fix the bug in search.py --- flask/index.py | 218 +++++++++++++++++++++--------------------------- flask/search.py | 98 +++++++++++++++++----- 2 files changed, 171 insertions(+), 145 deletions(-) diff --git a/flask/index.py b/flask/index.py index 0762a72..de3a796 100644 --- a/flask/index.py +++ b/flask/index.py @@ -5,222 +5,198 @@ import json from logger import Logger +# Load config and initialize managers once config_manager = ConfigManager() +config = config_manager.load_config() elasticsearch_manager = ElasticsearchManager(config_manager) logger_ = Logger() def add_pagerank(parts_response, uri2rank): """ - Adds the pagerank score for each part + Adds the pagerank score for each part. Arguments: parts_response {List} -- List containing all parts from the SPARQL query - uri2rank {List} -- List of each part and its calculated pagerank score + uri2rank {Dict} -- Dictionary of each part and its calculated pagerank score """ - for part in parts_response: - subject = part['subject'] - - if subject in uri2rank: - part['pagerank'] = uri2rank[subject] - else: - part['pagerank'] = 1 + part['pagerank'] = uri2rank.get(part['subject'], 1) def add_keywords(parts_response): """ - Adds the displayId to the 'keyword' category - + Adds the displayId to the 'keyword' category. + Arguments: parts_response {List} -- List containing all parts from the SPARQL query """ - for part in parts_response: - keywords = [] - - displayId = part.get('displayId') - if displayId is not None: - keywords.extend(displayId.split('_')) + display_id = part.get('displayId') + if display_id: + part['keywords'] = ' '.join(display_id.split('_')) + else: + part['keywords'] = '' - part['keywords'] = ' '.join(keywords) -def add_roles(parts_response): +def add_roles(parts_response, term_list): """ - Adds the synonyms from the SO-Ontologies list to each part's keyword category - + Adds the synonyms from the SO-Ontologies list to each part's keyword category. 
+ Arguments: parts_response {List} -- List containing all parts from the SPARQL query + term_list {List} -- List of terms from the SO-Ontologies """ - with open('so-simplified.json','r') as so_json: - term_list = json.load(so_json) - - for part in parts_response: - # Split the CSV of roles from sparql - role = part.get('role') + print("parts_response: ", len(parts_response)) + print("term_list: ", len(term_list)) + for part in parts_response: + # Split the CSV of roles from sparql + role = part.get('role') + if role and 'identifiers.org' in role: + keywords_list = [] + so_term = role[-10:].replace(':','_') + + for term in term_list: + if so_term in term['id']: + keywords_list.append(term['lbl']) + synonyms = term.get('synonyms', []) + for synonym in synonyms: + # remove the annoying header from the synonyms + if 'INSDC' in synonym: + synonym = synonym.replace('INSDC_qualifier:', '') + if synonym not in keywords_list: + keywords_list.append(synonym) + + part['keywords'] += ' ' + ' '.join(keywords_list) - if role is not None and 'identifiers.org' in role: - keywords_list = [] - so_term = role[-10:] - so_term = so_term.replace(':','_') - - for term in term_list: - if so_term in term['id']: - keywords_list.append(term['lbl']) - - if 'synonyms' in term and term['synonyms'] is not None: - for synonym in term['synonyms']: - - # remove the annoying header from the synonyms - if 'INSDC' in synonym: - synonym = synonym.replace('INSDC_qualifier:', '') - - if synonym not in keywords_list: - keywords_list.append(synonym) - - for keyword in keywords_list: - part['keywords'] += ' ' + keyword def add_sbol_type(parts_response): for part in parts_response: sbol_type = part.get('sboltype') + if sbol_type and 'http://www.biopax.org/release/biopax-level3.owl#' in sbol_type: + type_ = sbol_type[48:] + if 'region' in type_: + type_ = type_.replace('Region','') + part['keywords'] += ' ' + type_ - if sbol_type is not None and 'http://www.biopax.org/release/biopax-level3.owl#' in sbol_type: - type = sbol_type[48:] - - if 'region' in type: - type = type.replace('Region','') - - part['keywords'] += ' ' + type def create_parts_index(index_name): """ - Creates a new index + Creates a new index. Arguments: index_name {String} -- Name of the new index """ - - if elasticsearch_manager.get_es().indices.exists(index_name): + es = elasticsearch_manager.get_es() + if es.indices.exists(index_name): logger_.log('Index already exists -> deleting', True) - elasticsearch_manager.get_es().indices.delete(index=index_name) + es.indices.delete(index=index_name) body = { 'mappings': { - index_name: { - 'properties': { - 'subject': { - 'type': 'keyword' - }, - 'graph': { - 'type': 'keyword' - } - }, + 'properties': { + 'subject': {'type': 'keyword'}, + 'graph': {'type': 'keyword'} } }, - "settings": { - "number_of_shards": 1 + 'settings': { + 'number_of_shards': 1 } - } - elasticsearch_manager.get_es().indices.create(index=index_name, body=body) + es.indices.create(index=index_name, body=body) logger_.log('Index created', True) def bulk_index_parts(parts_response, index_name): """ - Adds each part as a document to the index - + Adds each part as a document to the index. 
+ Arguments: parts_response {List} -- List containing all parts from the SPARQL query index_name {String} -- Name of the index - - Raises: - Exception -- Indexing fails """ - - actions = [] - for i in range(len(parts_response)): - action = { - '_index': index_name, - '_type': index_name, - '_id': parts_response[i].get('subject'), - '_source': parts_response[i] - } - - actions.append(action) + es = elasticsearch_manager.get_es() + + def actions(): + for part in parts_response: + yield { + '_index': index_name, + '_id': part['subject'], + '_source': part + } logger_.log('Bulk indexing', True) try: - stats = helpers.bulk(elasticsearch_manager.get_es(), actions) + stats = helpers.bulk(es, actions()) logger_.log('Bulk indexing complete', True) - except: - logger_.log('[ERROR] Error_messages: ' + '\n'.join(stats[1]), True) - raise Exception("Bulk indexing failed") + except Exception as e: + logger_.log(f'[ERROR] Error during bulk indexing: {str(e)}' + '\n'.join(stats[1]), True) + raise + def update_index(uri2rank): """ - Main method - Args: - uri2rank: List of pageranks for each URI - - Returns: + Main method to update the index. + Args: + uri2rank: Dictionary of pageranks for each URI """ - index_name = config_manager.load_config()['elasticsearch_index_name'] + index_name = config['elasticsearch_index_name'] logger_.log('------------ Updating index ------------', True) - logger_.log('******** Query for parts ********', True) - parts_response = query.query_parts(indexing = True) + parts_response = query.query_parts(indexing=True) logger_.log('******** Query for parts complete ********', True) logger_.log('******** Adding parts to new index ********', True) add_pagerank(parts_response, uri2rank) add_keywords(parts_response) - add_roles(parts_response) + + # Load the SO-Ontologies list once + with open('so-simplified.json', 'r') as so_json: + term_list = json.load(so_json) + add_roles(parts_response, term_list) + add_sbol_type(parts_response) create_parts_index(index_name) bulk_index_parts(parts_response, index_name) - logger_.log('******** Finished adding ' + str(len(parts_response)) + ' parts to index ********', True) - + logger_.log(f'******** Finished adding {len(parts_response)} parts to index ********', True) logger_.log('------------ Successfully updated index ------------\n', True) def delete_subject(subject): """ - Delete part for incremental indexing - Args: - subject: - - Returns: + Delete part for incremental indexing. + Args: + subject: The subject to delete from the index. 
""" - index_name = config_manager.load_config()['elasticsearch_index_name'] + index_name = config['elasticsearch_index_name'] + es = elasticsearch_manager.get_es() body = { 'query': { 'bool': { 'must': [ - {'ids': {'values': subject}} + {'ids': {'values': [subject]}} ] } }, 'conflicts': 'proceed' } - elasticsearch_manager.get_es().delete_by_query(index=index_name, doc_type=index_name, body=body) + es.delete_by_query(index=index_name, body=body) def index_part(part): delete_subject(part['subject']) - index_name = config_manager.load_config()['elasticsearch_index_name'] - elasticsearch_manager.get_es().index(index=index_name, doc_type=index_name, id=part['subject'], body=part) + index_name = config['elasticsearch_index_name'] + es = elasticsearch_manager.get_es() + es.index(index=index_name, id=part['subject'], body=part) def refresh_index(subject, uri2rank): delete_subject(subject) - - part_response = query.query_parts('', 'FILTER (?subject = <' + subject + '>)', True) + part_response = query.query_parts('', f'FILTER (?subject = <{subject}>)', True) if len(part_response) == 1: add_pagerank(part_response, uri2rank) @@ -246,18 +222,16 @@ def incremental_remove(subject): def incremental_remove_collection(subject, uri_prefix): - collection_membership_query = ''' + collection_membership_query = f''' SELECT ?s - WHERE { - <''' + subject + '''> sbol2:member ?s . - FILTER(STRSTARTS(str(?s),''' + "'" + uri_prefix + "'" + ''')) - } + WHERE {{ + <{subject}> sbol2:member ?s . + FILTER(STRSTARTS(str(?s), '{uri_prefix}')) + }} ''' members = query.query_sparql(collection_membership_query) delete_subject(subject) for member in members: delete_subject(member['s']) - - diff --git a/flask/search.py b/flask/search.py index d772b8f..e8e7926 100644 --- a/flask/search.py +++ b/flask/search.py @@ -62,13 +62,20 @@ def search_es(es_query: str) -> Dict: } try: return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) - except Exception as e: - logger_.log(f"ES search failed: {e}") + except: + logger_.log("search_es(es_query: str)") raise def empty_search_es(offset: int, limit: int, allowed_graphs: List[str]) -> Dict: """ Empty string search based solely on pagerank. + Arguments: + offset {int} -- Offset for search results + limit {int} -- Size of search + allowed_graphs {List} -- List of allowed graphs to search on + + Returns: + List -- List of search results """ query = {'term': {'graph': allowed_graphs[0]}} if len(allowed_graphs) == 1 else {'terms': {'graph': allowed_graphs}} @@ -88,13 +95,19 @@ def empty_search_es(offset: int, limit: int, allowed_graphs: List[str]) -> Dict: } try: return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) - except Exception as e: - logger_.log(f"ES search failed: {e}") + except: + logger_.log("empty_search_es(offset: int, limit: int, allowed_graphs: List[str])") raise def search_es_allowed_subjects(es_query: str, allowed_subjects: List[str]) -> Dict: """ String query for ES searches limited to allowed parts. 
+ Arguments: + es_query {string} -- String to search for + allowed_subjects {list} - list of allowed subjects from Virtuoso + + Returns: + List -- List of all search results """ body = { 'query': { @@ -116,7 +129,7 @@ def search_es_allowed_subjects(es_query: str, allowed_subjects: List[str]) -> Di 'operator': 'or', 'fuzziness': 'AUTO' }}, - {'ids': {'values': allowed_subjects}} + {'ids': {'values': list(allowed_subjects)}} ] } }, @@ -124,21 +137,26 @@ def search_es_allowed_subjects(es_query: str, allowed_subjects: List[str]) -> Di 'script': { 'source': "_score * Math.log(doc['pagerank'].value + 1)" } - } - } + }, + }, }, 'from': 0, 'size': 10000 } try: return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) - except Exception as e: - logger_.log(f"ES search failed: {e}") + except: + logger_.log("search_es_allowed_subjects(es_query: str, allowed_subjects: List[str])") raise -def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]) -> Dict: +def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]): """ ES search purely limited to allowed parts. + Arguments: + allowed_subjects {list} - list of allowed subjects from Virtuoso + + Returns: + List -- List of all search results """ body = { 'query': { @@ -146,7 +164,7 @@ def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]) -> Dict 'query': { 'bool': { 'must': [ - {'ids': {'values': allowed_subjects}} + {'ids': {'values': list(allowed_subjects)}} ] } }, @@ -154,16 +172,16 @@ def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]) -> Dict 'script': { 'source': "_score * Math.log(doc['pagerank'].value + 1)" } - } - } + }, + }, }, 'from': 0, 'size': 10000 } try: return elasticsearch_manager.get_es().search(index=config_manager.load_config()['elasticsearch_index_name'], body=body) - except Exception as e: - logger_.log(f"ES search failed: {e}") + except: + logger_.log("search_es_allowed_subjects_empty_string") raise def parse_sparql_query(sparql_query, is_count_query): # Find FROM clause @@ -209,13 +227,18 @@ def extract_query(sparql_query): List -- List of information extracted """ return parse_sparql_query(sparql_query, is_count_query(sparql_query)) - - + def extract_allowed_graphs(_from: str, default_graph_uri: str) -> List[str]: """ Extracts the allowed graphs to search over. + Arguments: + _from {string} -- Graph where search originated + default_graph_uri {string} -- The default graph URI pulled from SBH + + Returns: + List -- List of allowed graphs """ - allowed_graphs = [default_graph_uri] if not _from else [graph.strip()[1:-1] for graph in _from.split('FROM') if graph.strip()] + allowed_graphs = [default_graph_uri] if not _from else [graph.strip()[1:-1] for graph in _from.split('FROM') if graph.strip()[1:-1]] if config_manager.load_config()['distributed_search']: allowed_graphs.extend(instance['instanceUrl'] + '/public' for instance in wor_client_.get_wor_instance()) return allowed_graphs @@ -226,6 +249,14 @@ def is_count_query(sparql_query: str) -> bool: def create_response(count: int, bindings: List[Dict], return_count: bool) -> Dict: """ Creates response to be sent back to SBH. + + Arguments: + count {int} -- ? + bindings {Dict} -- The bindings + return_count {int} -- ? + + Returns: + ? -- ? 
""" if return_count: return { @@ -254,6 +285,24 @@ def create_binding(subject: str, displayId: Optional[str], version: Optional[int percentMatch: float = -1, strandAlignment: str = 'N/A', CIGAR: str = 'N/A') -> Dict: """ Creates bindings to be sent to SBH. + Arguments: + subject {string} -- URI of part + displayId {string} -- DisplayId of part + version {int} -- Version of part + name {string} -- Name of part + description {string} -- Description of part + _type {string} -- SBOL type of part + role {string} -- S.O. role of part + order_by {?} -- ? + + Keyword Arguments: + percentMatch {number} -- Percent match of query part to the target part (default: {-1}) + strandAlignment {str} -- Strand alignment of the query part relatve to the target part (default: {'N/A'}) + CIGAR {str} -- Alignment of query part relative to the target part (default: {'N/A'}) + + Returns: + Dict -- Part and its information + """ binding = {} attributes = { @@ -292,6 +341,10 @@ def create_bindings(es_response, clusters, allowed_graphs, allowed_subjects=None Returns: Dict -- All parts and their corresponding information """ + if es_response is None or 'hits' not in es_response or 'hits' not in es_response['hits']: + logger_.log("[ERROR] Elasticsearch response is None or malformed.") + return [] + bindings = [] cluster_duplicates = set() @@ -314,7 +367,7 @@ def create_bindings(es_response, clusters, allowed_graphs, allowed_subjects=None elif subject in clusters: cluster_duplicates.update(clusters[subject]) - if 'http://sbols.org/v2#Sequence' in _source.get('type', ''): + if _source.get('type') is not None and 'http://sbols.org/v2#Sequence' in _source.get('type'): _score /= 10.0 binding = create_binding( @@ -348,7 +401,8 @@ def create_criteria_bindings(criteria_response, uri2rank, sequence_search=False, Dict -- Binding of parts """ bindings = [] - for part in criteria_response: + parts = (p for p in criteria_response if p.get('role') is None or 'http://wiki.synbiohub.org' in p.get('role')) + for part in parts: subject = part.get('subject') pagerank = uri2rank.get(subject, 1) @@ -570,6 +624,4 @@ def filter_sequence_search_subjects(_from, uris): uris {list} -- List of URI's from sequence search """ from_uris = set(re.findall(r"\<([A-Za-z0-9:\/.]+)\>*", _from)) - return [uri for uri in uris if any(f in uri for f in from_uris)] - - + return [uri for uri in uris if any(f in uri for f in from_uris)] \ No newline at end of file From 383359a7c794568b71b76f5a00ed3dc78f4f9c84 Mon Sep 17 00:00:00 2001 From: cl117 Date: Thu, 29 Aug 2024 16:59:08 -0600 Subject: [PATCH 08/26] fix the bug in index.py --- flask/cluster.py | 3 ++- flask/index.py | 35 +++++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/flask/cluster.py b/flask/cluster.py index 97332ce..7e5db86 100644 --- a/flask/cluster.py +++ b/flask/cluster.py @@ -47,7 +47,8 @@ def write_fasta(sequences): def run_uclust(): args = [usearch_binary_filename, '-cluster_fast', sequences_filename, '-id', uclust_identity, '-sort', 'length', '-uc', uclust_results_filename] - result = subprocess.run(args, capture_output=True, text=True) + # result = subprocess.run(args, capture_output=True, text=True) # Python3.7 + result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) logger_.log(result.stdout, True) def analyze_uclust(): diff --git a/flask/index.py b/flask/index.py index de3a796..ec2c476 100644 --- a/flask/index.py +++ b/flask/index.py @@ -59,12 +59,13 @@ def add_roles(parts_response, term_list): 
if so_term in term['id']: keywords_list.append(term['lbl']) synonyms = term.get('synonyms', []) - for synonym in synonyms: - # remove the annoying header from the synonyms - if 'INSDC' in synonym: - synonym = synonym.replace('INSDC_qualifier:', '') - if synonym not in keywords_list: - keywords_list.append(synonym) + if synonyms: + for synonym in synonyms: + # remove the annoying header from the synonyms + if 'INSDC' in synonym: + synonym = synonym.replace('INSDC_qualifier:', '') + if synonym not in keywords_list: + keywords_list.append(synonym) part['keywords'] += ' ' + ' '.join(keywords_list) @@ -93,16 +94,25 @@ def create_parts_index(index_name): body = { 'mappings': { - 'properties': { - 'subject': {'type': 'keyword'}, - 'graph': {'type': 'keyword'} + index_name: { + 'properties': { + 'subject': { + 'type': 'keyword' + }, + 'graph': { + 'type': 'keyword' + } + }, } }, 'settings': { 'number_of_shards': 1 } } + logger_.log("index_name: ", index_name) # empty + logger_.log("body: ", body) # empty es.indices.create(index=index_name, body=body) + logger_.log('Index created', True) @@ -120,6 +130,7 @@ def actions(): for part in parts_response: yield { '_index': index_name, + '_type': index_name, '_id': part['subject'], '_source': part } @@ -178,20 +189,20 @@ def delete_subject(subject): 'query': { 'bool': { 'must': [ - {'ids': {'values': [subject]}} + {'ids': {'values': subject}} ] } }, 'conflicts': 'proceed' } - es.delete_by_query(index=index_name, body=body) + es.delete_by_query(index=index_name, doc_type=index_name, body=body) def index_part(part): delete_subject(part['subject']) index_name = config['elasticsearch_index_name'] es = elasticsearch_manager.get_es() - es.index(index=index_name, id=part['subject'], body=part) + es.index(index=index_name, doc_type=index_name, id=part['subject'], body=part) def refresh_index(subject, uri2rank): From e457a8e6d5094ec3844a22897520590f36f39463 Mon Sep 17 00:00:00 2001 From: cl117 Date: Thu, 29 Aug 2024 23:03:21 -0600 Subject: [PATCH 09/26] optimize pagerank.py --- flask/index.py | 4 -- flask/pagerank.py | 149 +++++++++++++--------------------------------- flask/search.py | 2 +- 3 files changed, 42 insertions(+), 113 deletions(-) diff --git a/flask/index.py b/flask/index.py index ec2c476..8724552 100644 --- a/flask/index.py +++ b/flask/index.py @@ -46,8 +46,6 @@ def add_roles(parts_response, term_list): parts_response {List} -- List containing all parts from the SPARQL query term_list {List} -- List of terms from the SO-Ontologies """ - print("parts_response: ", len(parts_response)) - print("term_list: ", len(term_list)) for part in parts_response: # Split the CSV of roles from sparql role = part.get('role') @@ -109,8 +107,6 @@ def create_parts_index(index_name): 'number_of_shards': 1 } } - logger_.log("index_name: ", index_name) # empty - logger_.log("body: ", body) # empty es.indices.create(index=index_name, body=body) logger_.log('Index created', True) diff --git a/flask/pagerank.py b/flask/pagerank.py index 2b319f1..ccd3040 100644 --- a/flask/pagerank.py +++ b/flask/pagerank.py @@ -1,4 +1,3 @@ -from xml.etree import ElementTree import numpy as np import query from logger import Logger @@ -25,84 +24,34 @@ } ''' - -class graph: - # create uri to index mapping - def init_mapping(self, adjacency_list): - uris = set() - for parent in adjacency_list: - uris.add(parent) - for child in adjacency_list[parent]: - uris.add(child) - - self.index2uri = list(uris) - self.uri2index = {} - - for i in range(len(self.index2uri)): - uri = self.index2uri[i] - 
self.uri2index[uri] = i - - # assert mappings are correct - for i in range(len(self.index2uri)): - uri = self.index2uri[i] - index = self.uri2index[uri] - assert(index == i) - - - def init_in_links(self, adjacency_list): - for j in range(self.size): - self.in_links[j] = [] - - for parent in adjacency_list: - for child in adjacency_list[parent]: - parent_idx = self.uri2index[parent] - child_idx = self.uri2index[child] - self.in_links[child_idx].append(parent_idx) - - - def init_number_out_links(self, adjacency_list): - for j in range(self.size): - self.number_out_links[j] = 0 - - for parent in adjacency_list: - parent_idx = self.uri2index[parent] - number_children = len(adjacency_list[parent]) - self.number_out_links[parent_idx] = number_children - - - def init_dangling_pages(self, adjacency_list): - for parent in adjacency_list: - number_children = len(adjacency_list[parent]) - if number_children == 0: - self.dangling_pages.add(self.uri2index[parent]) - - +class Graph: def __init__(self, adjacency_list): - self.index2uri = [] - self.uri2index = {} - self.init_mapping(adjacency_list) - + self.uri2index = {uri: idx for idx, uri in enumerate(adjacency_list)} + self.index2uri = list(adjacency_list.keys()) self.size = len(self.index2uri) - - self.in_links = {} - self.init_in_links(adjacency_list) - - self.number_out_links = {} - self.init_number_out_links(adjacency_list) - + + self.in_links = {_:[] for _ in range(self.size)} + self.number_out_links = {_:0 for _ in range(self.size)} self.dangling_pages = set() - self.init_dangling_pages(adjacency_list) + for parent, children in adjacency_list.items(): + parent_idx = self.uri2index[parent] + if children: + self.number_out_links[parent_idx] = len(children) + for child in children: + child_idx = self.uri2index[child] + self.in_links[child_idx].append(parent_idx) + else: + self.dangling_pages.add(parent_idx) -# add uris as keys to adjacency_list -def populate_uris(uri_response): - adjacency_list = {} + def get_dangling_contrib(self, p): + return sum([p[j] for j in self.dangling_pages]) / self.size - for uri in uri_response: - adjacency_list[uri['subject']] = set() - - return adjacency_list + def get_teleportation_contrib(self): + return 1.0 / self.size +def populate_uris(uri_response): + return {uri['subject']: set() for uri in uri_response} # add edges def populate_links(link_response, adjacency_list): @@ -111,12 +60,10 @@ def populate_links(link_response, adjacency_list): adjacency_list[link['parent']].add(link['child']) except: raise - def pagerank(g, s=0.85, tolerance=0.001): n = g.size - p = np.matrix(np.ones((n, 1))) / n - + p = np.ones(n) / n # Initial probability distribution vector if n == 0: logger_.log('no iterations: empty graph', True) @@ -124,61 +71,47 @@ def pagerank(g, s=0.85, tolerance=0.001): iteration = 1 delta = 2 - - while delta > tolerance: - v = np.matrix(np.zeros((n, 1))) - dangling_contrib = sum([p[j] for j in g.dangling_pages]) / n - teleportation_contrib = 1 / n - + while delta > tolerance: + v = np.zeros(n) + dangling_contrib = g.get_dangling_contrib(p) + teleportation_contrib = g.get_teleportation_contrib() + for j in range(n): - link_contrib = sum([p[k] / g.number_out_links[k] for k in g.in_links[j]]) - v[j] = s * link_contrib + s * dangling_contrib + (1 - s) * teleportation_contrib - new_p = v / np.sum(v) - - delta = np.sum(np.abs(p - new_p)) - logger_.log('Iteration ' + str(iteration) + ': L1 norm delta is ' + str(delta), True) + in_link_contrib = np.sum(p[k] / g.number_out_links[k] for k in g.in_links[j]) + v[j] = s 
* (in_link_contrib + dangling_contrib) + (1 - s) * teleportation_contrib + + v /= np.sum(v) + delta = np.sum(np.abs(p - v)) + logger_.log(f'Iteration {iteration}: L1 norm delta is {delta}', True) - p = new_p + p = v iteration += 1 - - return p + return p def make_uri2rank(pr_vector, uri2index): - uri2rank = {} - - try: - for uri in uri2index: - uri2rank[uri] = pr_vector[uri2index[uri]] - except: - raise - - return uri2rank - + return {uri: pr_vector[idx] for uri, idx in uri2index.items()} def update_pagerank(): logger_.log('------------ Updating pagerank ------------', True) logger_.log('******** Query for uris ********', True) uri_response = query.query_sparql(uri_query) logger_.log('******** Query for uris complete ********', True) + adjacency_list = populate_uris(uri_response) logger_.log('******** Query for links ********', True) link_response = query.query_sparql(link_query) logger_.log('******** Query for links complete ********', True) + populate_links(link_response, adjacency_list) - g = graph(adjacency_list) + g = Graph(adjacency_list) + logger_.log('******** Running pagerank ********', True) - pr = pagerank(g, tolerance=float(config_manager.load_config()['pagerank_tolerance'])) + pr_vector = pagerank(g, tolerance=float(config_manager.load_config()['pagerank_tolerance'])) logger_.log('******** Running pagerank complete ********', True) logger_.log('------------ Successfully updated pagerank ------------\n', True) - pr_vector = np.squeeze(np.asarray(pr)) - - # after squeeze, make sure it at least has a dimension in the case that there is only one element - if pr_vector.shape == (): - pr_vector = np.array([pr_vector]) return make_uri2rank(pr_vector, g.uri2index) - diff --git a/flask/search.py b/flask/search.py index e8e7926..2e314ed 100644 --- a/flask/search.py +++ b/flask/search.py @@ -212,7 +212,7 @@ def parse_sparql_query(sparql_query, is_count_query): # Construct es_query es_query = ' '.join(keywords).strip() - print("Hello es_query: ", es_query) + #print("Hello es_query: ", es_query) return es_query, _from, criteria, offset, limit, sequence, flags From f8d8009ff913a8620306bc9c38fb86940a56c6f9 Mon Sep 17 00:00:00 2001 From: cl117 Date: Fri, 30 Aug 2024 11:40:39 -0600 Subject: [PATCH 10/26] optimize sequencesearch.py --- flask/sequencesearch.py | 104 ++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/flask/sequencesearch.py b/flask/sequencesearch.py index 89d6964..f9c0dec 100644 --- a/flask/sequencesearch.py +++ b/flask/sequencesearch.py @@ -1,43 +1,46 @@ -from xml.etree import ElementTree +import os import subprocess -import query -import cluster -import search -from sys import platform -import base64 import tempfile +from sys import platform from logger import Logger +import cluster logger_ = Logger() - -# handling selection of VSEARCH binary -if platform == "linux" or platform == "linux2": - vsearch_binary_filename = 'usearch/vsearch_linux' -elif platform == "darwin": - vsearch_binary_filename = 'usearch/vsearch_macos' -else: - logger_.log("Sorry, your OS is not supported for sequence based-search.") - -# add valid flags to here -globalFlags = {'maxaccepts': '50', 'id': '0.8', 'iddef': '2', 'maxrejects': '0', 'maxseqlength': '5000', 'minseqlength': '20'} -exactFlags = {} +# Handling selection of VSEARCH binary +vsearch_binaries = { + "linux": "usearch/vsearch_linux", + "darwin": "usearch/vsearch_macos" +} + +vsearch_binary_filename = vsearch_binaries.get(platform, None) +if not vsearch_binary_filename: + logger_.log("Sorry, 
your OS is not supported for sequence-based search.") + +# Predefined global and exact search flags +global_flags = { + 'maxaccepts': '50', + 'id': '0.8', + 'iddef': '2', + 'maxrejects': '0', + 'maxseqlength': '5000', + 'minseqlength': '20' +} +exact_flags = {} def write_to_temp(sequence): """ - Writes text sequence to temp FASTA file for search + Writes a text sequence to a temporary FASTA file for search. Arguments: - sequence {string} -- Sequence to write to file + sequence {str} -- Sequence to write to file Returns: - string -- file path + str -- Path to the temp file """ - temp = tempfile.NamedTemporaryFile(suffix=".fsa",delete=False) - with open(temp.name, 'w') as f: - f.write('>sequence_to_search\n') - f.write('%s\n' % sequence) - return temp.name + with tempfile.NamedTemporaryFile(suffix=".fsa", delete=False, mode='w') as temp_file: + temp_file.write(f'>sequence_to_search\n{sequence}\n') + return temp_file.name # pass in the sequence to this function, replace searchsequence.fsa with the query sequence def run_vsearch_global(fileName): @@ -47,14 +50,16 @@ def run_vsearch_global(fileName): Arguments: fileName {string} -- Path to file """ + # setting maxaccepts to 0 disables the limit (searches for all possible matches) args = [vsearch_binary_filename, '--usearch_global', fileName, '--db', 'dumps/sequences.fsa','--uc', fileName[:-4] + '.uc', '--uc_allhits',] - args = append_flags_to_args(args, globalFlags) - popen = subprocess.Popen(args, stdout=subprocess.PIPE) + args = append_flags_to_args(args, global_flags) + + popen = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) popen.wait() output = popen.stdout.read() logger_.log(output) - + def run_vsearch_exact(fileName): """ Runs the "search_exact" command @@ -64,13 +69,12 @@ def run_vsearch_exact(fileName): """ # setting maxaccepts to 0 disables the limit (searches for all possible matches) args = [vsearch_binary_filename, '--search_exact', fileName, '--db', 'dumps/sequences.fsa','--uc', fileName[:-4] + '.uc', '--uc_allhits'] - args = append_flags_to_args(args, exactFlags) + args = append_flags_to_args(args, exact_flags) popen = subprocess.Popen(args, stdout=subprocess.PIPE) popen.wait() output = popen.stdout.read() logger_.log(output) - def append_flags_to_args(argsList, flags): """ Append user flags to VSEARCH command line args @@ -95,9 +99,8 @@ def add_global_flags(userFlags): userFlags {dict} -- flags selected by user """ for flag in userFlags: - if flag in globalFlags: - globalFlags[flag] = userFlags[flag] - + if flag in global_flags: + global_flags[flag] = userFlags[flag] def add_exact_flags(userFlags): """ @@ -109,32 +112,29 @@ def add_exact_flags(userFlags): userFlags {dict} -- flags selected by user """ for flag in userFlags: - if flag in exactFlags: - exactFlags[flag] = userFlags[flag] + if flag in exact_flags: + exact_flags[flag] = userFlags[flag] - -def sequence_search(userFlags, fileName): +def sequence_search(user_flags, file_name): """ - Main method - - Handles all search queries + Handles all search queries. 
Arguments: - userFlags {dict} -- flags selected by user - fileName {string} -- path to temp file + user_flags {dict} -- Flags selected by the user + file_name {str} -- Path to the temp file Returns: - set -- search results by URI + set -- Search results by URI """ logger_.log('Starting sequence search') - - if "search_exact" in userFlags: - add_exact_flags(userFlags) - run_vsearch_exact(fileName) + + if "search_exact" in user_flags: + add_exact_flags(user_flags) + run_vsearch_exact(file_name) else: - add_global_flags(userFlags) - run_vsearch_global(fileName) + add_global_flags(user_flags) + run_vsearch_global(file_name) logger_.log('Sequence search complete') - - return cluster.uclust2uris(fileName[:-4] + '.uc') + + return cluster.uclust2uris(file_name[:-4] + '.uc') From 9e447433ba6db228673018ee6c0487e7805838e4 Mon Sep 17 00:00:00 2001 From: cl117 Date: Fri, 30 Aug 2024 11:55:44 -0600 Subject: [PATCH 11/26] rm debug toolbar --- flask/explorer.py | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/flask/explorer.py b/flask/explorer.py index 93ae178..a2a3660 100644 --- a/flask/explorer.py +++ b/flask/explorer.py @@ -17,8 +17,6 @@ from elasticsearchManager import ElasticsearchManager from logger import Logger -from flask_debugtoolbar import DebugToolbarExtension -from flask_debugtoolbar_lineprofilerpanel.profile import line_profile # Configure logging, This will affect all loggers in your application, not just the Werkzeug logger. log = logging.getLogger('werkzeug') @@ -30,26 +28,7 @@ logger_ = Logger() app = Flask(__name__) -app.config.update( - SECRET_KEY='your-secret-key', # Required for the debug toolbar - DEBUG=True, - DEBUG_TB_INTERCEPT_REDIRECTS=False, - DEBUG_TB_PROFILER_ENABLED=True, - DEBUG_TB_PANELS=[ - 'flask_debugtoolbar.panels.versions.VersionDebugPanel', - 'flask_debugtoolbar.panels.timer.TimerDebugPanel', - 'flask_debugtoolbar.panels.headers.HeaderDebugPanel', - 'flask_debugtoolbar.panels.request_vars.RequestVarsDebugPanel', - 'flask_debugtoolbar.panels.config_vars.ConfigVarsDebugPanel', - 'flask_debugtoolbar.panels.template.TemplateDebugPanel', - 'flask_debugtoolbar.panels.logger.LoggingPanel', - 'flask_debugtoolbar.panels.profiler.ProfilerDebugPanel', - 'flask_debugtoolbar_lineprofilerpanel.panels.LineProfilerPanel' - ] -) - -# Initialize the debug toolbar -toolbar = DebugToolbarExtension(app) + # Error handler @app.errorhandler(Exception) @@ -180,12 +159,10 @@ def incremental_remove_collection(): raise @app.route('/test', methods=['GET']) -@line_profile def SBOLExplore_test_endpoint(): return render_template('index.html') @app.route('/', methods=['GET']) -@line_profile def sparql_search_endpoint(): try: es = elasticsearch_manager.get_es() @@ -231,4 +208,4 @@ def search_by_string(): raise if __name__ == "__main__": - app.run(debug=True) # threaded=True + app.run(debug=False, threaded=True) # threaded=True From cdbfd2bb78714752b5d9a0cc6baaa749633fa6bf Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:12:37 -0600 Subject: [PATCH 12/26] Update Dockerfile --- flask/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index 22d57f7..e5e4f2c 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:22.04 MAINTAINER Michael Zhang RUN apt-get update && \ From 7e262e048b852e6427eb8a00fb30e11fb2099110 Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:12:54 -0600 Subject: [PATCH 
13/26] Update Dockerfile-synbiohub --- flask/docker/Dockerfile-synbiohub | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index ed54445..043a252 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:22.04 MAINTAINER Michael Zhang RUN apt-get update && \ From 20a6efcaf283080138ae39fcc7be6a1545fd2139 Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:19:59 -0600 Subject: [PATCH 14/26] Update Dockerfile --- flask/docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index e5e4f2c..c8807a1 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -5,8 +5,8 @@ RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python3.6 python3-pip && \ - python3.6 -m pip install pip --upgrade && \ + apt-get install -y git python python-pip && \ + python -m pip install pip --upgrade && \ git clone https://github.com/michael13162/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ From c6457252b62ed5f388fc2b5018fbf89f82af5e19 Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:20:17 -0600 Subject: [PATCH 15/26] Update Dockerfile-synbiohub --- flask/docker/Dockerfile-synbiohub | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index 043a252..bb24bea 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -5,8 +5,8 @@ RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python3.6 python3-pip && \ - python3.6 -m pip install pip --upgrade && \ + apt-get install -y git python python-pip && \ + python -m pip install pip --upgrade && \ git clone https://github.com/michael13162/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ From 0f76965800bfa6a056daf7011e2a62cde6387f65 Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:29:48 -0600 Subject: [PATCH 16/26] Update Dockerfile --- flask/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index c8807a1..b416aad 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -5,7 +5,7 @@ RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python python-pip && \ + apt-get install -y git python3.6 python3-pip && \ python -m pip install pip --upgrade && \ git clone https://github.com/michael13162/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ From b4fa8e455cfdb7a83485c1ca50a578f36e94cdb5 Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:30:04 -0600 Subject: [PATCH 17/26] Update Dockerfile-synbiohub --- flask/docker/Dockerfile-synbiohub | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index bb24bea..d9e2a66 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -5,7 +5,7 @@ RUN apt-get update && \ apt-get install -y software-properties-common && 
\ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python python-pip && \ + apt-get install -y git python3.6 python3-pip && \ python -m pip install pip --upgrade && \ git clone https://github.com/michael13162/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ From 1864f94ed97b2985501ed4b6891cad1afe36a95f Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:37:19 -0600 Subject: [PATCH 18/26] Update Dockerfile --- flask/docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index b416aad..15afad1 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -5,8 +5,8 @@ RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python3.6 python3-pip && \ - python -m pip install pip --upgrade && \ + apt-get install -y git python3.10 python3-pip && \ + python3.10 -m pip install pip --upgrade && \ git clone https://github.com/michael13162/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ From 53c99fd3dc214c942032c1167ff4cd2cc6a985b8 Mon Sep 17 00:00:00 2001 From: cjmyers Date: Thu, 5 Sep 2024 10:37:41 -0600 Subject: [PATCH 19/26] Update Dockerfile-synbiohub --- flask/docker/Dockerfile-synbiohub | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index d9e2a66..04ba3ef 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -5,8 +5,8 @@ RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python3.6 python3-pip && \ - python -m pip install pip --upgrade && \ + apt-get install -y git python3.10 python3-pip && \ + python3.10 -m pip install pip --upgrade && \ git clone https://github.com/michael13162/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ From 17400180eeb071e8541c9e507ff79ffee65c1da7 Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 10 Sep 2024 18:11:32 -0600 Subject: [PATCH 20/26] update python to 3.10 Flast to 3 --- flask/explorer.py | 2 +- flask/requirements.txt | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/flask/explorer.py b/flask/explorer.py index a2a3660..d32129f 100644 --- a/flask/explorer.py +++ b/flask/explorer.py @@ -38,7 +38,7 @@ def handle_error(e): return jsonify(error=str(e.name + ": " + e.description)), e.code return jsonify(error=str(type(e).__name__) + str(e)), 500 -@app.before_first_request +@app.before_request def startup(): def auto_update_index(): update_interval = int(config_manager.load_config().get('updateTimeInDays', 0)) * 86400 diff --git a/flask/requirements.txt b/flask/requirements.txt index 0ebff0e..828e708 100644 --- a/flask/requirements.txt +++ b/flask/requirements.txt @@ -1,18 +1,18 @@ certifi==2018.4.16 chardet==3.0.4 -click==6.7 +click==8.1.7 elasticsearch==6.3.0 elasticsearch-dsl==6.1.0 -Flask==1.0.2 +Flask==3.0.3 idna==2.7 ipaddress==1.0.22 -itsdangerous==0.24 -Jinja2 -MarkupSafe==2.0.1 -numpy +itsdangerous==2.2.0 +Jinja2==3.1.4 +MarkupSafe==2.1.5 +numpy==2.1.1 python-dateutil==2.7.3 requests==2.19.1 six==1.11.0 urllib3==1.23 -Werkzeug +Werkzeug==3.0.4 apscheduler==3.10.4 From da2e80ac0e0f92ea8dfc2f4e9baddcccd062b03d Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 10 Sep 2024 
22:51:29 -0600 Subject: [PATCH 21/26] update to python3.11 add venv --- flask/docker/Dockerfile | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index 15afad1..ba89039 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -1,13 +1,25 @@ FROM ubuntu:22.04 -MAINTAINER Michael Zhang + +# Set the timezone environment variables to avoid interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=America/New_York + +# Install tzdata without interaction +RUN apt-get update && apt-get install -y tzdata + +# Set timezone +RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ + dpkg-reconfigure -f noninteractive tzdata RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python3.10 python3-pip && \ - python3.10 -m pip install pip --upgrade && \ - git clone https://github.com/michael13162/SBOLExplorer.git && \ + apt-get install -y git cron python3.11 python3-pip python3.11-venv && \ + python3.11 -m pip install pip --upgrade && \ + python3.11 -m venv jammy && \ + . jammy/bin/activate && \ + git clone --branch change_dependencies https://github.com/SynBioDex/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ crontab update.cron From 3269807351a755a1149046bae8c4d5d106f3c3fe Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 10 Sep 2024 23:01:25 -0600 Subject: [PATCH 22/26] update Dockerfile-synbiohub --- flask/docker/Dockerfile-synbiohub | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index 04ba3ef..ecdec92 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -1,13 +1,25 @@ FROM ubuntu:22.04 -MAINTAINER Michael Zhang + +#Set the timezone environment variables to avoid interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=America/New_York + +# Install tzdata without interaction +RUN apt-get update && apt-get install -y tzdata + +# Set timezone +RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ + dpkg-reconfigure -f noninteractive tzdata RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ - apt-get install -y git python3.10 python3-pip && \ - python3.10 -m pip install pip --upgrade && \ - git clone https://github.com/michael13162/SBOLExplorer.git && \ + apt-get install -y git cron python3.11 python3-pip python3.11-venv && \ + python3.11 -m pip install pip --upgrade && \ + python3.11 -m venv jammy && \ + . 
jammy/bin/activate && \ + git clone --branch change_dependencies https://github.com/SynBioDex/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ crontab update.cron From 225772eed57c12eeb7d3a72035c4fcb012e3dffe Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 10 Sep 2024 23:30:57 -0600 Subject: [PATCH 23/26] update docker-compose --- flask/docker/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flask/docker/docker-compose.yml b/flask/docker/docker-compose.yml index b57a530..9af118a 100644 --- a/flask/docker/docker-compose.yml +++ b/flask/docker/docker-compose.yml @@ -1,7 +1,7 @@ version: "3" services: sbolexplorer: - image: michael13162/sbolexplorer:latest + image: myersresearchgroup/sbolexplorer:snapshot ports: - "13162:13162" depends_on: From 14e29bacc2ea1821fd5eef74d7ed492bcb11a31c Mon Sep 17 00:00:00 2001 From: cl117 Date: Wed, 11 Sep 2024 11:12:29 -0600 Subject: [PATCH 24/26] update DockerFile to make sure container working --- flask/docker/Dockerfile | 5 ++--- flask/docker/Dockerfile-synbiohub | 3 +-- flask/start.sh | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index ba89039..f071d46 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -12,7 +12,7 @@ RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ dpkg-reconfigure -f noninteractive tzdata RUN apt-get update && \ - apt-get install -y software-properties-common && \ + apt-get install -y software-properties-common coreutils && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ apt-get install -y git cron python3.11 python3-pip python3.11-venv && \ @@ -38,5 +38,4 @@ RUN mkdir /mnt/config && \ rm -rf dumps && \ ln -s /mnt/data dumps -CMD "./start.sh" - +CMD sh -c ". ../../jammy/bin/activate && ./start.sh" diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index ecdec92..16e271d 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -40,5 +40,4 @@ RUN mkdir /mnt/config && \ ADD config-synbiohub.json /mnt/config/config.json -CMD "./start.sh" - +CMD sh -c ". 
../../jammy/bin/activate && ./start.sh" diff --git a/flask/start.sh b/flask/start.sh index 66c1465..2b9765f 100755 --- a/flask/start.sh +++ b/flask/start.sh @@ -1,7 +1,7 @@ #!/bin/bash echo "Starting SBOLExplorer" - +source ../../jammy/bin/activate export FLASK_APP=explorer.py export FLASK_ENV=development flask run --host=0.0.0.0 --port=13162 From 21e58ed02aee9f2b8efdc1e7faa464cb32d2cfe0 Mon Sep 17 00:00:00 2001 From: cl117 Date: Fri, 13 Sep 2024 12:52:27 -0600 Subject: [PATCH 25/26] Update local and build in local --- flask/config.json | 2 +- flask/docker/Dockerfile | 2 +- flask/docker/Dockerfile-synbiohub | 2 +- flask/docker/config-synbiohub.json | 18 +++++++++++------- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/flask/config.json b/flask/config.json index a188e64..ceceb42 100644 --- a/flask/config.json +++ b/flask/config.json @@ -2,7 +2,7 @@ "uclust_identity": "0.8", "elasticsearch_index_name": "part", "pagerank_tolerance": "0.0001", - "elasticsearch_endpoint": "http://localhost:9200/", + "elasticsearch_endpoint": "http://elasticsearch:9200/", "sparql_endpoint": "http://localhost:8890/sparql?", "last_update_start": "none", "last_update_end": "none", diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index f071d46..80ae65f 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -38,4 +38,4 @@ RUN mkdir /mnt/config && \ rm -rf dumps && \ ln -s /mnt/data dumps -CMD sh -c ". ../../jammy/bin/activate && ./start.sh" +CMD sh -c ". ../../jammy/bin/activate && ./start.sh && cat config.json" diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index 16e271d..3d25fcd 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -40,4 +40,4 @@ RUN mkdir /mnt/config && \ ADD config-synbiohub.json /mnt/config/config.json -CMD sh -c ". ../../jammy/bin/activate && ./start.sh" +CMD sh -c ". ../../jammy/bin/activate && ./start.sh && cat config.json" diff --git a/flask/docker/config-synbiohub.json b/flask/docker/config-synbiohub.json index 1ecfca0..6db3215 100644 --- a/flask/docker/config-synbiohub.json +++ b/flask/docker/config-synbiohub.json @@ -1,10 +1,14 @@ { - "distributed_search": true, - "elasticsearch_endpoint": "http://elasticsearch:9200/", - "elasticsearch_index_name": "part", - "pagerank_tolerance": "0.0001", - "sparql_endpoint": "http://virtuoso:8890/sparql?", - "synbiohub_public_graph": "https://synbiohub.utah.edu/public", - "uclust_identity": "0.8" + "uclust_identity": "0.8", + "elasticsearch_index_name": "part", + "pagerank_tolerance": "0.0001", + "elasticsearch_endpoint": "http://elasticsearch:9200/", + "sparql_endpoint": "http://localhost:8890/sparql?", + "last_update_start": "none", + "last_update_end": "none", + "distributed_search": false, + "which_search": "vsearch", + "autoUpdateIndex": false, + "updateTimeInDays": "1" } From 3370a5968ba4b061f81ce5686472f826333ce1c1 Mon Sep 17 00:00:00 2001 From: cl117 Date: Sun, 15 Sep 2024 10:50:52 -0600 Subject: [PATCH 26/26] change to main branch --- flask/docker/Dockerfile | 4 ++-- flask/docker/Dockerfile-synbiohub | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flask/docker/Dockerfile b/flask/docker/Dockerfile index 80ae65f..b6f5883 100644 --- a/flask/docker/Dockerfile +++ b/flask/docker/Dockerfile @@ -19,7 +19,7 @@ RUN apt-get update && \ python3.11 -m pip install pip --upgrade && \ python3.11 -m venv jammy && \ . 
jammy/bin/activate && \ - git clone --branch change_dependencies https://github.com/SynBioDex/SBOLExplorer.git && \ + git clone https://github.com/SynBioDex/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ crontab update.cron @@ -38,4 +38,4 @@ RUN mkdir /mnt/config && \ rm -rf dumps && \ ln -s /mnt/data dumps -CMD sh -c ". ../../jammy/bin/activate && ./start.sh && cat config.json" +CMD sh -c ". ../../jammy/bin/activate && ./start.sh" diff --git a/flask/docker/Dockerfile-synbiohub b/flask/docker/Dockerfile-synbiohub index 3d25fcd..cfe8566 100644 --- a/flask/docker/Dockerfile-synbiohub +++ b/flask/docker/Dockerfile-synbiohub @@ -19,7 +19,7 @@ RUN apt-get update && \ python3.11 -m pip install pip --upgrade && \ python3.11 -m venv jammy && \ . jammy/bin/activate && \ - git clone --branch change_dependencies https://github.com/SynBioDex/SBOLExplorer.git && \ + git clone https://github.com/SynBioDex/SBOLExplorer.git && \ cd SBOLExplorer/flask && \ pip install -r requirements.txt && \ crontab update.cron @@ -40,4 +40,4 @@ RUN mkdir /mnt/config && \ ADD config-synbiohub.json /mnt/config/config.json -CMD sh -c ". ../../jammy/bin/activate && ./start.sh && cat config.json" +CMD sh -c ". ../../jammy/bin/activate && ./start.sh"
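
For reference, a minimal way to build and exercise the image produced by the Dockerfile above might look like the following sketch. The build-context path, the host volume paths, and reuse of the snapshot tag from docker-compose.yml are assumptions for illustration, not commands taken from the repository:

    # Build from the directory containing the Dockerfile (the image clones SBOLExplorer
    # during the build, so the build-context contents matter little; the context path is assumed).
    docker build -f flask/docker/Dockerfile -t myersresearchgroup/sbolexplorer:snapshot .

    # Run with the port published in docker-compose.yml, mounting host directories where the
    # Dockerfile creates /mnt/config and /mnt/data. The host-side paths here are assumptions.
    docker run -p 13162:13162 \
        -v "$PWD/config":/mnt/config \
        -v "$PWD/data":/mnt/data \
        myersresearchgroup/sbolexplorer:snapshot

At container start, the CMD activates the jammy virtual environment before invoking start.sh, which sources the same environment and runs Flask on port 13162, so the application executes with the Python 3.11 dependencies installed during the image build.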