From 65411d408cae6d51161e199a53e116cc3f6ca419 Mon Sep 17 00:00:00 2001 From: Alkid Date: Wed, 28 Feb 2024 16:16:12 +0100 Subject: [PATCH 1/8] updated example --- examples/example_knowledge_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_knowledge_base.py b/examples/example_knowledge_base.py index e85f94ff..a4ca8b82 100644 --- a/examples/example_knowledge_base.py +++ b/examples/example_knowledge_base.py @@ -38,7 +38,7 @@ print('*' * 100) # Direct concept hierarchy from Top to Bottom. -for concept in kb.class_hierarchy().items(): +for concept in kb.class_hierarchy.items(): print(f'{concept.get_iri().as_str()} => {[c.get_iri().as_str() for c in kb.get_direct_sub_concepts(concept)]}') print('*' * 100) From 2c637b7deb45cf4268f148eb9561843b47bae5cb Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 29 Feb 2024 12:29:58 +0100 Subject: [PATCH 2/8] `clean` method now cleans cache too #337 --- ontolearn/concept_learner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ontolearn/concept_learner.py b/ontolearn/concept_learner.py index f148f78f..aa74fd4b 100644 --- a/ontolearn/concept_learner.py +++ b/ontolearn/concept_learner.py @@ -791,6 +791,7 @@ def __init__(self, self.__setup() def __setup(self): + self._cache = dict() self.clean() if self.fitness_func is None: self.fitness_func = LinearPressureFitness() @@ -810,7 +811,6 @@ def __setup(self): self._result_population = None self._dp_to_prim_type = dict() self._dp_splits = dict() - self._cache = dict() self._split_properties = [] self.pset = self.__build_primitive_set() @@ -1059,7 +1059,7 @@ def clean(self): del creator.Quality except AttributeError: pass - + self._cache.clear() super().clean() From ad3d318d1483829835962268dd355fe4fef9bd92 Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 29 Feb 2024 13:06:55 +0100 Subject: [PATCH 3/8] Facilitated optional installation --- setup.py | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 
insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index d828fe43..2bd4d451 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,10 @@ from setuptools import setup, find_packages +import re with open('README.md', 'r') as fh: long_description = fh.read() -setup( - name="ontolearn", - description="Ontolearn is an open-source software library for structured machine learning in Python. Ontolearn includes modules for processing knowledge bases, inductive logic programming and ontology engineering.", - version="0.6.2", - packages=find_packages(), - install_requires=[ - "scikit-learn>=0.24.1", + +_deps = [ "matplotlib>=3.3.4", "owlready2>=0.40", "torch>=1.7.1", @@ -19,11 +15,44 @@ "deap>=1.3.1", "httpx>=0.25.2", "tqdm>=4.64.0", - "transformers>=4.35.0", + "transformers>=4.38.1", "pytest>=7.2.2", "owlapy==0.1.1", "dicee==0.1.2", - "ontosample>=0.2.2"], + "ontosample>=0.2.2", + "gradio>=4.11.0"] + +deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ ]+)(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)} + + +def deps_list(*pkgs): + return [deps[pkg] for pkg in pkgs] + + +extras = dict() +extras["min"] = deps_list( + "matplotlib", + "torch", + "rdflib", + "pandas", + "sortedcontainers", + "owlready2", + "owlapy", + "flask", # Drill, NCES + "tqdm", "transformers", # NCES + "dicee", # Drill + "deap", # Evolearner +) + +extras["full"] = (extras["min"] + deps_list("httpx", "pytest", "gradio", "ontosample")) + +setup( + name="ontolearn", + description="Ontolearn is an open-source software library for structured machine learning in Python. 
Ontolearn includes modules for processing knowledge bases, inductive logic programming and ontology engineering.", + version="0.7.0", + packages=find_packages(), + install_requires=extras["min"], + extras_require=extras, author='Caglar Demir', author_email='caglardemir8@gmail.com', url='https://github.com/dice-group/Ontolearn', From 7151cb30a02869cc2a598cc4e7725dfdac5e9ff8 Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 29 Feb 2024 13:16:01 +0100 Subject: [PATCH 4/8] Increased version --- docs/usage/01_introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage/01_introduction.md b/docs/usage/01_introduction.md index 50fbd9e6..8655af19 100644 --- a/docs/usage/01_introduction.md +++ b/docs/usage/01_introduction.md @@ -1,6 +1,6 @@ # Ontolearn -**Version:** ontolearn 0.6.1 +**Version:** ontolearn 0.7.0 **GitHub repository:** [https://github.com/dice-group/Ontolearn](https://github.com/dice-group/Ontolearn) From f58195aa81388885d26249e67f2ac13bdb8aca8c Mon Sep 17 00:00:00 2001 From: Alkid Date: Fri, 1 Mar 2024 13:19:54 +0100 Subject: [PATCH 5/8] EvoLearner resets after each `fit` call #337 --- ontolearn/concept_learner.py | 31 ++++++++++++++++++++++++------- tests/test_evolearner.py | 14 +++++++++----- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/ontolearn/concept_learner.py b/ontolearn/concept_learner.py index aa74fd4b..9a8bfbc4 100644 --- a/ontolearn/concept_learner.py +++ b/ontolearn/concept_learner.py @@ -689,7 +689,7 @@ class EvoLearner(BaseConceptLearner[EvoLearnerNode]): __slots__ = 'fitness_func', 'init_method', 'algorithm', 'value_splitter', 'tournament_size', \ 'population_size', 'num_generations', 'height_limit', 'use_data_properties', 'pset', 'toolbox', \ '_learning_problem', '_result_population', 'mut_uniform_gen', '_dp_to_prim_type', '_dp_splits', \ - '_split_properties', '_cache', 'use_card_restrictions', 'card_limit', 'use_inverse' + '_split_properties', '_cache', 'use_card_restrictions', 'card_limit', 
'use_inverse', 'total_fits' name = 'evolearner' @@ -788,11 +788,12 @@ def __init__(self, self.population_size = population_size self.num_generations = num_generations self.height_limit = height_limit + self.total_fits = 0 self.__setup() def __setup(self): + self.clean(partial=True) self._cache = dict() - self.clean() if self.fitness_func is None: self.fitness_func = LinearPressureFitness() @@ -971,7 +972,11 @@ def fit(self, *args, **kwargs) -> 'EvoLearner': """ Find hypotheses that explain pos and neg. """ - self.clean() + # Don't reset everything if the user is just using this model for 1 learning problem, since he may use the + # register_op method, else-wise we need to `clean` before fitting to get a fresh fit. + if self.total_fits > 0: + self.clean() + self.total_fits += 1 learning_problem = self.construct_learning_problem(PosNegLPStandard, args, kwargs) self._learning_problem = learning_problem.encode_kb(self.kb) @@ -1049,9 +1054,7 @@ def _fitness_func(self, individual: Tree): self._cache[ind_str] = (e.q, individual.fitness.values[0]) self._number_of_tested_concepts += 1 - def clean(self): - self._result_population = None - + def clean(self, partial: bool = False): # Resets classes if they already exist, names must match the ones that were created in the toolbox try: del creator.Fitness @@ -1059,8 +1062,22 @@ def clean(self): del creator.Quality except AttributeError: pass - self._cache.clear() super().clean() + if not partial: + # Reset everything if fitting more than one lp. Tests have shown that this is necessary to get the + # best performance of EvoLearner. 
+ self._result_population = None + self._cache.clear() + self.fitness_func = LinearPressureFitness() + self.init_method = EARandomWalkInitialization() + self.algorithm = EASimple() + self.mut_uniform_gen = EARandomInitialization(min_height=1, max_height=3) + self.value_splitter = EntropyValueSplitter() + self._dp_to_prim_type = dict() + self._dp_splits = dict() + self._split_properties = [] + self.pset = self.__build_primitive_set() + self.toolbox = self.__build_toolbox() class NCES(BaseNCES): diff --git a/tests/test_evolearner.py b/tests/test_evolearner.py index 70d64062..4bd0b0a9 100644 --- a/tests/test_evolearner.py +++ b/tests/test_evolearner.py @@ -18,9 +18,9 @@ def test_regression_family(self): kb = KnowledgeBase(path=settings['data_path'][3:]) model = EvoLearner(knowledge_base=kb, max_runtime=10) - regression_test_evolearner = {'Aunt': 0.9, 'Brother': 1.0, - 'Cousin': 0.9, 'Granddaughter': 1.0, - 'Uncle': 0.9, 'Grandgrandfather': 0.94} + regression_test_evolearner = {'Aunt': 1.0, 'Brother': 1.0, + 'Cousin': 1.0, 'Granddaughter': 1.0, + 'Uncle': 1.0, 'Grandgrandfather': 1.0} for str_target_concept, examples in settings['problems'].items(): pos = set(map(OWLNamedIndividual, map(IRI.create, set(examples['positive_examples'])))) neg = set(map(OWLNamedIndividual, map(IRI.create, set(examples['negative_examples'])))) @@ -31,8 +31,12 @@ def test_regression_family(self): self.assertEqual(returned_model, model) hypotheses = list(returned_model.best_hypotheses(n=3)) self.assertGreaterEqual(hypotheses[0].quality, regression_test_evolearner[str_target_concept]) - self.assertGreaterEqual(hypotheses[0].quality, hypotheses[1].quality) - self.assertGreaterEqual(hypotheses[1].quality, hypotheses[2].quality) + # best_hypotheses returns distinct hypotheses and sometimes the model will not find 'n' distinct hypothesis, + # hence the checks + if len(hypotheses) == 2: + self.assertGreaterEqual(hypotheses[0].quality, hypotheses[1].quality) + if len(hypotheses) == 3: + 
self.assertGreaterEqual(hypotheses[1].quality, hypotheses[2].quality) def test_regression_mutagenesis_multiple_fits(self): kb = KnowledgeBase(path='KGs/Mutagenesis/mutagenesis.owl') From ae62f8e369bb6a934f72d30358b53ceac5460c7c Mon Sep 17 00:00:00 2001 From: Jean-KOUAGOU Date: Mon, 4 Mar 2024 14:17:00 +0100 Subject: [PATCH 6/8] added CLIP --- README.md | 52 ++-- examples/clip_notebook.ipynb | 234 ++++++++++++++++++ examples/concept_learning_cv_evaluation.py | 64 ++++- ontolearn/clip_architectures.py | 117 +++++++++ ontolearn/clip_trainer.py | 138 +++++++++++ ontolearn/concept_learner.py | 274 ++++++++++++++++++++- ontolearn/data_struct.py | 82 +++++- ontolearn/nces_trainer.py | 10 +- 8 files changed, 919 insertions(+), 52 deletions(-) create mode 100644 examples/clip_notebook.ipynb create mode 100644 ontolearn/clip_architectures.py create mode 100644 ontolearn/clip_trainer.py diff --git a/README.md b/README.md index 8a445c06..1c78cae1 100644 --- a/README.md +++ b/README.md @@ -109,44 +109,44 @@ Note that F1 scores denote the quality of the find/constructed concept w.r.t. 
E^ ### Family Benchmark Results -| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | -|:-------------------|----------------:|---------------:|----------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|----------------:|---------------:|----------:| -| Aunt | 0.848 | 0.637 | 8.923 | 0.918 | 0.855 | 8.923 | 1.000 | 0.986 | 1.849 | 0.868 | 0.820 | 10.195 | 0.960 | 0.960 | 7.214 | 0.715 | 0.712 | 0.363 | -| Brother | 1.000 | 1.000 | 0.009 | 1.000 | 1.000 | 0.009 | 1.000 | 1.000 | 0.380 | 1.000 | 1.000 | 0.011 | 1.000 | 1.000 | 7.018 | 0.946 | 0.967 | 0.337 | -| Cousin | 0.740 | 0.708 | 7.096 | 0.796 | 0.789 | 7.096 | 1.000 | 0.993 | 2.247 | 0.826 | 0.779 | 10.150 | 0.977 | 0.951 | 8.215 | 0.667 | 0.667 | 0.333 | -| Daughter | 1.000 | 1.000 | 0.008 | 1.000 | 1.000 | 0.008 | 1.000 | 1.000 | 0.331 | 1.000 | 1.000 | 0.013 | 1.000 | 1.000 | 7.410 | 0.992 | 0.983 | 0.316 | -| Father | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.422 | 1.000 | 1.000 | 0.005 | 1.000 | 1.000 | 7.369 | 0.937 | 0.935 | 0.290 | -| Granddaughter | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.361 | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 7.254 | 0.924 | 0.941 | 0.320 | -| Grandfather | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.347 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 7.180 | 0.709 | 0.727 | 0.334 | -| Grandgranddaughter | 1.000 | 1.000 | 0.005 | 1.000 | 1.000 | 0.005 | 1.000 | 1.000 | 0.285 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 6.651 | 0.860 | 0.873 | 0.300 | -| Grandgrandfather | 1.000 | 1.000 | 0.464 | 1.000 | 1.000 | 0.464 | 1.000 | 1.000 | 0.287 | 1.000 | 1.000 | 0.116 | 0.953 | 0.947 
| 6.656 | 0.768 | 0.793 | 0.337 | -| Grandgrandmother | 1.000 | 1.000 | 3.018 | 1.000 | 1.000 | 3.018 | 1.000 | 1.000 | 0.274 | 1.000 | 1.000 | 0.116 | 0.944 | 0.947 | 6.713 | 0.706 | 0.703 | 0.308 | -| Grandgrandson | 1.000 | 1.000 | 1.127 | 1.000 | 1.000 | 1.127 | 1.000 | 1.000 | 0.346 | 1.000 | 1.000 | 0.025 | 0.940 | 0.911 | 6.945 | 0.860 | 0.909 | 0.289 | -| Grandmother | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.365 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 7.033 | 0.761 | 0.764 | 0.299 | -| Grandson | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.394 | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 7.071 | 0.908 | 0.924 | 0.342 | -| Mother | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.412 | 1.000 | 1.000 | 0.005 | 1.000 | 1.000 | 7.524 | 0.977 | 0.978 | 0.343 | -| PersonWithASibling | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.365 | 0.737 | 0.725 | 10.203 | 1.000 | 1.000 | 7.473 | 0.925 | 0.941 | 0.349 | -| Sister | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.341 | 1.000 | 1.000 | 0.009 | 1.000 | 1.000 | 7.038 | 0.879 | 0.894 | 0.330 | -| Son | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 0.349 | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 7.232 | 0.927 | 0.893 | 0.316 | -| Uncle | 0.903 | 0.891 | 10.118 | 0.907 | 0.891 | 10.118 | 1.000 | 0.967 | 1.451 | 0.928 | 0.908 | 10.089 | 0.926 | 0.918 | 7.160 | 0.688 | 0.693 | 0.345 | +| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | 
+|:-------------------|----------------:|---------------:|----------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|----------------:|---------------:|----------:|----------------:|---------------:|----------:| +| Aunt | 0.848 | 0.637 | 9.206 | 0.918 | 0.855 | 9.206 | 0.996 | 0.969 | 3.390 | 0.886 | 0.799 | 60.243 | 0.971 | 0.949 | 6.366 | 0.721 | 0.635 | 0.552 | 0.899 | 0.891 | 5.763 | +| Brother | 1.000 | 1.000 | 0.005 | 1.000 | 1.000 | 0.005 | 1.000 | 1.000 | 0.281 | 1.000 | 1.000 | 0.020 | 1.000 | 1.000 | 6.216 | 0.978 | 0.975 | 0.450 | 1.000 | 1.000 | 0.692 | +| Cousin | 0.740 | 0.708 | 7.336 | 0.796 | 0.789 | 7.336 | 1.000 | 1.000 | 1.653 | 0.831 | 0.784 | 60.416 | 0.978 | 0.941 | 7.073 | 0.667 | 0.667 | 0.465 | 0.774 | 0.761 | 6.671 | +| Daughter | 1.000 | 1.000 | 0.006 | 1.000 | 1.000 | 0.006 | 1.000 | 1.000 | 0.309 | 1.000 | 1.000 | 0.033 | 1.000 | 1.000 | 6.459 | 0.993 | 0.977 | 0.534 | 1.000 | 1.000 | 0.716 | +| Father | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.411 | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 6.522 | 0.897 | 0.903 | 0.448 | 1.000 | 1.000 | 0.588 | +| Granddaughter | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.320 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 6.233 | 0.911 | 0.916 | 0.497 | 1.000 | 1.000 | 0.646 | +| Grandfather | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.314 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 6.185 | 0.743 | 0.717 | 0.518 | 1.000 | 1.000 | 0.721 | +| Grandgranddaughter | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 0.293 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 5.858 | 0.837 | 0.840 | 0.518 | 1.000 | 1.000 | 0.710 | +| Grandgrandfather | 1.000 | 1.000 | 0.668 | 1.000 | 1.000 | 0.668 | 1.000 | 1.000 | 0.341 | 1.000 | 1.000 | 0.243 | 0.951 | 0.947 | 5.915 | 0.759 | 0.677 | 0.511 | 1.000 | 1.000 
| 1.964 | +| Grandgrandmother | 1.000 | 1.000 | 0.381 | 1.000 | 1.000 | 0.381 | 1.000 | 1.000 | 0.258 | 1.000 | 1.000 | 0.243 | 0.944 | 0.947 | 5.918 | 0.721 | 0.687 | 0.498 | 0.997 | 1.000 | 2.620 | +| Grandgrandson | 1.000 | 1.000 | 0.341 | 1.000 | 1.000 | 0.341 | 1.000 | 1.000 | 0.276 | 1.000 | 1.000 | 0.122 | 0.938 | 0.911 | 6.093 | 0.779 | 0.809 | 0.460 | 1.000 | 1.000 | 2.555 | +| Grandmother | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.385 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 6.135 | 0.762 | 0.725 | 0.480 | 1.000 | 1.000 | 0.628 | +| Grandson | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.299 | 1.000 | 1.000 | 0.003 | 1.000 | 1.000 | 6.301 | 0.896 | 0.903 | 0.552 | 1.000 | 1.000 | 0.765 | +| Mother | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.327 | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 6.570 | 0.967 | 0.972 | 0.555 | 1.000 | 1.000 | 0.779 | +| PersonWithASibling | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.377 | 0.737 | 0.725 | 60.194 | 1.000 | 1.000 | 6.548 | 0.927 | 0.928 | 0.648 | 1.000 | 1.000 | 0.999 | +| Sister | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.356 | 1.000 | 1.000 | 0.017 | 1.000 | 1.000 | 6.315 | 0.866 | 0.876 | 0.512 | 1.000 | 1.000 | 0.616 | +| Son | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.002 | 1.000 | 1.000 | 0.317 | 1.000 | 1.000 | 0.004 | 1.000 | 1.000 | 6.579 | 0.892 | 0.855 | 0.537 | 1.000 | 1.000 | 0.700 | +| Uncle | 0.903 | 0.891 | 12.441 | 0.907 | 0.891 | 12.441 | 1.000 | 0.971 | 1.675 | 0.951 | 0.894 | 60.337 | 0.894 | 0.896 | 6.310 | 0.667 | 0.665 | 0.619 | 0.928 | 0.942 | 5.577 | ### Mutagenesis Benchmark Results ```shell python examples/concept_learning_evaluation.py --lps LPs/Mutagenesis/lps.json --kb KGs/Mutagenesis/mutagenesis.owl --max_runtime 60 --report mutagenesis_results.csv && python -c 'import pandas as pd; print(pd.read_csv("mutagenesis_results.csv", index_col=0).to_markdown(floatfmt=".3f"))' 
``` -| LP | F1-OCEL | RT-OCEL | F1-CELOE | RT-CELOE | F1-Evo | RT-Evo | F1-DRILL | RT-DRILL | F1-TDL | RT-TDL | -|:---------|----------:|----------:|-----------:|-----------:|---------:|---------:|-----------:|-----------:|---------:|---------:| -| NotKnown | 0.916 | 60.226 | 0.916 | 41.243 | 0.976 | 40.411 | 0.704 | 60.044 | 1.000 | 49.022 | +| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | +|:---------|----------------:|---------------:|----------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|----------------:|---------------:|----------:|----------------:|---------------:|----------:| +| NotKnown | 0.916 | 0.918 | 58.328 | 0.916 | 0.918 | 58.328 | 0.724 | 0.729 | 49.281 | 0.704 | 0.704 | 60.052 | 0.879 | 0.771 | 7.763 | 0.564 | 0.560 | 0.493 | 0.814 | 0.807 | 5.622 | ### Carcinogenesis Benchmark Results ```shell python examples/concept_learning_evaluation.py --lps LPs/Carcinogenesis/lps.json --kb KGs/Carcinogenesis/carcinogenesis.owl --max_runtime 60 --report carcinogenesis_results.csv && python -c 'import pandas as pd; print(pd.read_csv("carcinogenesis_results.csv", index_col=0).to_markdown(floatfmt=".3f"))' ``` -| LP | F1-OCEL | RT-OCEL | F1-CELOE | RT-CELOE | F1-Evo | RT-Evo | F1-DRILL | RT-DRILL | F1-TDL | RT-TDL | -|:---------|----------:|----------:|-----------:|-----------:|---------:|---------:|-----------:|-----------:|---------:|---------:| -| NOTKNOWN | 0.739 | 64.975 | 0.739 | 60.004 | 0.814 | 60.758 | 0.705 | 60.066 | 1.000 | 56.701 | +| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | 
Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | +|:---------|----------------:|---------------:|----------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|----------------:|---------------:|----------:|----------------:|---------------:|----------:| +| NOTKNOWN | 0.738 | 0.711 | 42.936 | 0.740 | 0.701 | 42.936 | 0.744 | 0.733 | 63.465 | 0.705 | 0.704 | 60.069 | 0.879 | 0.682 | 7.260 | 0.415 | 0.396 | 1.911 | 0.720 | 0.700 | 85.037 | diff --git a/examples/clip_notebook.ipynb b/examples/clip_notebook.ipynb new file mode 100644 index 00000000..bc98619b --- /dev/null +++ b/examples/clip_notebook.ipynb @@ -0,0 +1,234 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "blond-letter", + "metadata": {}, + "source": [ + "# CLIP Notebook\n", + "This is a jupyter notebook file to execute [CLIP](ontolearn.concept_learner.CLIP) and generate predictive results. We recommend you to see the [concept learners](../docs/usage/06_concept_learners.md) guide before continuing with the execution.\n", + "Also if you have not done it already, from the main directory \"Ontolearn\", run the commands for Datasets mentioned [here](https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files) to download the datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "japanese-ivory", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Warning: SQLite3 version 3.40.0 and 3.41.2 have huge performance regressions; please install version 3.41.1 or 3.42!\n", + "\n" + ] + } + ], + "source": [ + "import json\n", + "from ontolearn.knowledge_base import KnowledgeBase\n", + "from ontolearn.concept_learner import CLIP\n", + "from ontolearn.refinement_operators import ExpressRefinement\n", + "from ontolearn.learning_problem import PosNegLPStandard\n", + "from owlapy.model import OWLNamedIndividual, IRI\n", + "from ontolearn.utils import setup_logging\n" + ] + }, + { + "cell_type": "markdown", + "id": "pending-coast", + "metadata": {}, + "source": [ + "Open `uncle_lp.json` where we have stored the learning problem for the concept of 'Uncle' and the path to the 'family' ontology." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "beginning-syntax", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('uncle_lp.json') as json_file:\n", + " settings = json.load(json_file)" + ] + }, + { + "cell_type": "markdown", + "id": "humanitarian-heating", + "metadata": {}, + "source": [ + "Create an instance of the class `KnowledeBase` by using the path that is stored in `settings`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "caroline-indiana", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "kb = KnowledgeBase(path=settings['data_path'])" + ] + }, + { + "cell_type": "markdown", + "id": "lucky-activation", + "metadata": {}, + "source": [ + "Retreive the IRIs of the positive and negative examples of Uncle from `settings` and create an instance of `PosNegLPStandard`. 
(more info about this [here](../docs/usage/06_concept_learners.md#configure-the-learning-problem))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "processed-patrick", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "examples = settings['Uncle']\n", + "p = set(examples['positive_examples'])\n", + "n = set(examples['negative_examples'])\n", + "typed_pos = set(map(OWLNamedIndividual, map(IRI.create, p)))\n", + "typed_neg = set(map(OWLNamedIndividual, map(IRI.create, n)))\n", + "lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg)" + ] + }, + { + "cell_type": "markdown", + "id": "mechanical-latin", + "metadata": {}, + "source": [ + "Create a model of [CLIP](ontolearn.concept_learner.CLIP) and fit the learning problem to the model." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "171d1aa4-6c12-42c0-b7e9-8cf2dce85ff9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "op = ExpressRefinement(knowledge_base=kb, use_inverse=False,\n", + " use_numeric_datatypes=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "binding-moderator", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Loaded length predictor!\n", + "\n", + " Loaded length predictor!\n", + "\n", + " Loaded length predictor!\n", + "\n", + " Loaded length predictor!\n", + "\n", + "***** Predicted length: 5 *****\n", + "\n", + "***** Predicted length: 5 *****\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = CLIP(knowledge_base=kb, path_of_embeddings=\"../CLIPData/family/embeddings/ConEx_entity_embeddings.csv\",\n", + " refinement_operator=op, load_pretrained=True, max_runtime=200)\n", + "model.fit(lp)" + ] + }, + { + "cell_type": "markdown", + "id": "d981f2b9-3489-494e-825d-6a72ee480d4f", + "metadata": {}, + "source": [ + 
"## Retrieve top 3 hypotheses and print them." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c6a90b21-3594-441d-bed0-eb822db5f993", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " at 0x0304774\tMale ⊓ (∀ hasParent.Grandparent)\tQuality:0.90476\tHeuristic:0.40407\tDepth:2\tH_exp:6\t|RC|:7\t|Indv.|:None\n", + " at 0x0ca154a\tMale ⊓ (∀ hasChild.Grandchild)\tQuality:0.90476\tHeuristic:0.36919\tDepth:1\tH_exp:7\t|RC|:7\t|Indv.|:None\n", + " at 0x2adbb89\tMale ⊓ (∀ hasChild.(¬Grandfather))\tQuality:0.88889\tHeuristic:0.39044\tDepth:3\tH_exp:6\t|RC|:0\t|Indv.|:None\n" + ] + }, + { + "data": { + "text/plain": [ + "[None, None, None]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hypotheses = list(model.best_hypotheses(n=3))\n", + "[print(_) for _ in hypotheses]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "onto", + "language": "python", + "name": "onto" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/concept_learning_cv_evaluation.py b/examples/concept_learning_cv_evaluation.py index acab30f0..c9fb4c61 100644 --- a/examples/concept_learning_cv_evaluation.py +++ b/examples/concept_learning_cv_evaluation.py @@ -12,7 +12,8 @@ import time import pandas as pd from ontolearn.knowledge_base import KnowledgeBase -from ontolearn.concept_learner import CELOE, OCEL, EvoLearner, NCES +from ontolearn.concept_learner import CELOE, OCEL, EvoLearner, NCES, CLIP +from ontolearn.refinement_operators import ExpressRefinement from ontolearn.learners import Drill, TDL from ontolearn.learning_problem import PosNegLPStandard from 
ontolearn.metrics import F1 @@ -32,13 +33,13 @@ def dl_concept_learning(args): settings = json.load(json_file) kb = KnowledgeBase(path=args.kb) - ocel = OCEL(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), + ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) - celoe = CELOE(knowledge_base=KnowledgeBase(path=args.kb), quality_func=F1(), + celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) - drill = Drill(knowledge_base=KnowledgeBase(path=args.kb), path_pretrained_kge=args.path_pretrained_kge, + drill = Drill(knowledge_base=kb, path_pretrained_kge=args.path_pretrained_kge, quality_func=F1(), max_runtime=args.max_runtime) - tdl = TDL(knowledge_base=KnowledgeBase(path=args.kb), + tdl = TDL(knowledge_base=kb, dataframe_triples=pd.DataFrame( data=sorted([(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], key=lambda x: len(x)), columns=['subject', 'relation', 'object'], dtype=str), @@ -46,13 +47,27 @@ def dl_concept_learning(args): max_runtime=args.max_runtime) nces = NCES(knowledge_base_path=args.kb, quality_func=F1(), path_of_embeddings=args.path_of_nces_embeddings, pretrained_model_name=["LSTM", "GRU", "SetTransformer"], num_predictions=5) + + express_rho = ExpressRefinement(kb, use_inverse=False, use_numeric_datatypes=False) + clip = CLIP(knowledge_base=kb, refinement_operator=express_rho, quality_func=F1(), + max_num_of_concepts_tested=int(1e9), max_runtime=args.max_runtime, + path_of_embeddings=args.path_of_clip_embeddings, + pretrained_predictor_name=["LSTM", "GRU", "SetTransformer", "CNN"], load_pretrained=True) # dictionary to store the data data = dict() - for str_target_concept, examples in settings['problems'].items(): + if "problems" in settings: + problems = settings['problems'].items() + positives_key = "positive_examples" + negatives_key = "negative_examples" + else: + problems = settings.items() + positives_key = "positive examples" + negatives_key = "negative 
examples" + for str_target_concept, examples in problems: print('Target concept: ', str_target_concept) - p = examples['positive_examples'] - n = examples['negative_examples'] + p = examples[positives_key] + n = examples[negatives_key] kf = StratifiedKFold(n_splits=args.folds, shuffle=True, random_state=args.random_seed) X = np.array(p + n) @@ -67,16 +82,16 @@ def dl_concept_learning(args): train_neg = {neg_individual for neg_individual in X[train_index][y[train_index] == 0]} # Sanity checking for individuals used for training. - assert train_pos.issubset(examples['positive_examples']) - assert train_neg.issubset(examples['negative_examples']) + assert train_pos.issubset(examples[positives_key]) + assert train_neg.issubset(examples[negatives_key]) # () Extract positive and negative examples from test fold test_pos = {pos_individual for pos_individual in X[test_index][y[test_index] == 1]} test_neg = {neg_individual for neg_individual in X[test_index][y[test_index] == 0]} # Sanity checking for individuals used for testing. 
- assert test_pos.issubset(examples['positive_examples']) - assert test_neg.issubset(examples['negative_examples']) + assert test_pos.issubset(examples[positives_key]) + assert test_neg.issubset(examples[negatives_key]) train_lp = PosNegLPStandard(pos=set(map(OWLNamedIndividual, map(IRI.create, train_pos))), neg=set(map(OWLNamedIndividual, map(IRI.create, train_neg)))) @@ -217,6 +232,28 @@ def dl_concept_learning(args): print(f"NCES Train Quality: {train_f1_nces:.3f}", end="\t") print(f"NCES Test Quality: {test_f1_nces:.3f}", end="\t") print(f"NCES Runtime: {rt_nces:.3f}") + + + print("CLIP starts..", end="\t") + start_time = time.time() + pred_clip = clip.fit(train_lp).best_hypotheses(n=1) + rt_clip = time.time() - start_time + print("CLIP ends..", end="\t") + # () Quality on the training data + train_f1_clip = compute_f1_score(individuals={i for i in kb.individuals(pred_clip.concept)}, + pos=train_lp.pos, + neg=train_lp.neg) + # () Quality on test data + test_f1_clip = compute_f1_score(individuals={i for i in kb.individuals(pred_clip.concept)}, + pos=test_lp.pos, + neg=test_lp.neg) + + data.setdefault("Train-F1-CLIP", []).append(train_f1_clip) + data.setdefault("Test-F1-CLIP", []).append(test_f1_clip) + data.setdefault("RT-CLIP", []).append(rt_clip) + print(f"CLIP Train Quality: {train_f1_clip:.3f}", end="\t") + print(f"CLIP Test Quality: {test_f1_clip:.3f}", end="\t") + print(f"CLIP Runtime: {rt_clip:.3f}") df = pd.DataFrame.from_dict(data) df.to_csv(args.report, index=False) @@ -227,12 +264,13 @@ def dl_concept_learning(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Description Logic Concept Learning') parser.add_argument("--max_runtime", type=int, default=10, help="Max runtime") - parser.add_argument("--lps", type=str, required=True, help="Path fto the learning problems") + parser.add_argument("--lps", type=str, required=True, help="Path to the learning problems") parser.add_argument("--folds", type=int, default=10, 
help="Number of folds of cross validation.") parser.add_argument("--kb", type=str, required=True, help="Knowledge base") parser.add_argument("--path_pretrained_kge", type=str, default=None) parser.add_argument("--path_of_nces_embeddings", type=str, default=None) + parser.add_argument("--path_of_clip_embeddings", type=str, default=None) parser.add_argument("--report", type=str, default="report.csv") parser.add_argument("--random_seed", type=int, default=1) dl_concept_learning(parser.parse_args()) \ No newline at end of file diff --git a/ontolearn/clip_architectures.py b/ontolearn/clip_architectures.py new file mode 100644 index 00000000..ac210f6a --- /dev/null +++ b/ontolearn/clip_architectures.py @@ -0,0 +1,117 @@ +import torch, torch.nn as nn +import random +from typing import List +from ontolearn.nces_modules import * + +class LengthLearner_LSTM(nn.Module): + """LSTM architecture""" + def __init__(self, input_size, output_size, proj_dim=256, rnn_n_layers=2, drop_prob=0.2): + super().__init__() + self.name = 'LSTM' + self.loss = nn.CrossEntropyLoss() + self.lstm = nn.LSTM(input_size, proj_dim, rnn_n_layers, + dropout=drop_prob, batch_first=True) + self.dropout = nn.Dropout(drop_prob) + self.fc1 = nn.Linear(2*proj_dim, proj_dim) + self.fc2 = nn.Linear(proj_dim, proj_dim) + self.fc3 = nn.Linear(proj_dim, output_size) + + def forward(self, x1, x2): + ''' Forward pass through the network.''' + x1, _ = self.lstm(x1) + x1 = x1.sum(1).contiguous().view(x1.shape[0], -1) + x2, _ = self.lstm(x2) + x2 = x2.sum(1).contiguous().view(x2.shape[0], -1) + x = torch.cat([x1, x2], dim=-1) + x = self.fc1(x) + x = torch.selu(x) + x = self.dropout(x) + x = self.fc2(x) + x = x + torch.tanh(x) + x = self.fc3(x) + return x + +class LengthLearner_GRU(nn.Module): + """GRU architecture""" + def __init__(self, input_size, output_size, proj_dim=256, rnn_n_layers=2, drop_prob=0.2): + super().__init__() + self.name = 'GRU' + self.loss = nn.CrossEntropyLoss() + self.gru = nn.GRU(input_size, 
proj_dim, rnn_n_layers, + dropout=drop_prob, batch_first=True) + self.dropout = nn.Dropout(drop_prob) + self.fc1 = nn.Linear(2*proj_dim, proj_dim) + self.fc2 = nn.Linear(proj_dim, proj_dim) + self.fc3 = nn.Linear(proj_dim, output_size) + + def forward(self, x1, x2): + ''' Forward pass through the network.''' + x1, _ = self.gru(x1) + x1 = x1.sum(1).contiguous().view(x1.shape[0], -1) + x2, _ = self.gru(x2) + x2 = x2.sum(1).contiguous().view(x2.shape[0], -1) + x = torch.cat([x1, x2], dim=-1) + x = self.fc1(x) + x = torch.selu(x) + x = self.dropout(x) + x = self.fc2(x) + x = x + torch.tanh(x) + x = self.fc3(x) + return x + + +class LengthLearner_CNN(nn.Module): + """CNN architecture""" + def __init__(self, input_size, output_size, num_examples, proj_dim=256, kernel_size: list=[[5,7], [5,7]], stride: list=[[3,3], [3,3]], drop_prob=0.2): + super().__init__() + assert isinstance(kernel_size, list) and isinstance(kernel_size[0], list), "kernel size and stride must be lists of lists, e.g., [[5,7], [5,7]]" + self.name = 'CNN' + self.loss = nn.CrossEntropyLoss() + self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(kernel_size[0][0], kernel_size[0][1]), stride=(stride[0][0], stride[0][1]), padding=(0,0)) + self.conv2 = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=(kernel_size[1][0], kernel_size[1][1]), stride=(stride[1][0], stride[1][1]), padding=(0,0)) + self.dropout1d = nn.Dropout(drop_prob) + self.dropout2d = nn.Dropout2d(drop_prob) + conv_out_dim = 3536 + self.fc1 = nn.Linear(conv_out_dim, proj_dim) + self.fc2 = nn.Linear(proj_dim, proj_dim) + self.fc3 = nn.Linear(proj_dim, output_size) + + def forward(self, x1, x2): + ''' Forward pass through the network.''' + x1 = x1.unsqueeze(1) + x2 = x2.unsqueeze(1) + x = torch.cat([x1, x2], dim=-2) + x = self.conv1(x) + x = torch.selu(x) + x = self.dropout2d(x) + x = self.conv2(x) + x = x.view(x.shape[0], -1) + x = self.fc1(x) + x = torch.selu(x) + x = self.dropout1d(x) + x = self.fc2(x) + x = x + 
torch.tanh(x) + x = self.fc3(x) + return x + + +class LengthLearner_SetTransformer(nn.Module): + """SetTransformer architecture.""" + def __init__(self, input_size, output_size, proj_dim=256, num_heads=4, num_seeds=1, num_inds=32): + super().__init__() + self.name = 'SetTransformer' + self.loss = nn.CrossEntropyLoss() + self.enc = nn.Sequential( + ISAB(input_size, proj_dim, num_heads, num_inds), + ISAB(proj_dim, proj_dim, num_heads, num_inds)) + self.dec = nn.Sequential( + PMA(proj_dim, num_heads, num_seeds), + nn.Linear(proj_dim, output_size)) + + def forward(self, x1, x2): + ''' Forward pass through the network.''' + x1 = self.enc(x1) + x2 = self.enc(x2) + x = torch.cat([x1, x2], dim=-2) + x = self.dec(x).squeeze() + return x \ No newline at end of file diff --git a/ontolearn/clip_trainer.py b/ontolearn/clip_trainer.py new file mode 100644 index 00000000..96421524 --- /dev/null +++ b/ontolearn/clip_trainer.py @@ -0,0 +1,138 @@ +import numpy as np +import copy +import torch +from tqdm import trange +from collections import defaultdict +import os +import json +from torch.optim.lr_scheduler import ExponentialLR +from torch.nn import functional as F +from torch.nn.utils import clip_grad_value_ +from torch.nn.utils.rnn import pad_sequence +from sklearn.metrics import f1_score, accuracy_score +import time + + + +class CLIPTrainer: + """CLIP trainer.""" + def __init__(self, clip, epochs=300, learning_rate=1e-4, decay_rate=0, clip_value=5.0, + storage_path="./"): + self.clip = clip + self.epochs = epochs + self.learning_rate = learning_rate + self.decay_rate = decay_rate + self.clip_value = clip_value + self.storage_path = storage_path + + def compute_eval_metric(self, target, prediction): + f1 = 100*f1_score(target, prediction, average="micro") + acc = 100*accuracy_score(target, prediction) + return f1, acc + + def get_optimizer(self, length_predictor, optimizer='Adam'): + if optimizer == 'Adam': + return torch.optim.Adam(length_predictor.parameters(), 
lr=self.learning_rate) + elif optimizer == 'SGD': + return torch.optim.SGD(length_predictor.parameters(), lr=self.learning_rate) + elif optimizer == 'RMSprop': + return torch.optim.RMSprop(length_predictor.parameters(), lr=self.learning_rate) + else: + raise ValueError + print('Unsupported optimizer') + + def show_num_learnable_params(self): + print("*"*20+"Trainable model size"+"*"*20) + size = sum([p.numel() for p in self.clip.length_predictor.parameters()]) + size_ = 0 + print("Length Predictor: ", size) + print("*"*20+"Trainable model size"+"*"*20) + print() + return size + + def train(self, train_dataloader, save_model=True, optimizer='Adam', record_runtime=True): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if isinstance(self.clip.length_predictor, list): + self.clip.length_predictor = copy.deepcopy(self.clip.length_predictor[0]) + model_size = self.show_num_learnable_params() + if device.type == "cpu": + print("Training on CPU, it may take long...") + else: + print("GPU available !") + print() + print("#"*50) + print() + print("{} starts training... \n".format(self.clip.length_predictor.name)) + print("#"*50, "\n") + length_predictor = copy.deepcopy(self.clip.length_predictor).train() + desc = length_predictor.name + if device.type == "cuda": + length_predictor.cuda() + opt = self.get_optimizer(length_predictor=length_predictor, optimizer=optimizer) + if self.decay_rate: + self.scheduler = ExponentialLR(opt, self.decay_rate) + Train_loss = [] + F1, Acc = [], [] + best_score = 0. 
+ if record_runtime: + t0 = time.time() + Epochs = trange(self.epochs, desc=f'Loss: {np.nan}, F1: {np.nan}, Acc: {np.nan}', leave=True) + for e in Epochs: + f1s, accs = [], [] + train_losses = [] + for x1, x2, labels in train_dataloader: + if device.type == "cuda": + x1, x2, labels = x1.cuda(), x2.cuda(), labels.cuda() + scores = length_predictor(x1, x2) + loss = length_predictor.loss(scores, labels) + predictions = scores.argmax(1).detach().cpu().numpy() + f1, acc = self.compute_eval_metric(labels.cpu().numpy(), predictions) + f1s.append(f1) + accs.append(acc) + train_losses.append(loss.item()) + opt.zero_grad() + loss.backward() + clip_grad_value_(length_predictor.parameters(), clip_value=self.clip_value) + opt.step() + if self.decay_rate: + self.scheduler.step() + F1.append(np.mean(f1s)) + Acc.append(np.mean(accs)) + Train_loss.append(np.mean(train_losses)) + Epochs.set_description('Loss: {:.4f}, F1: {:.2f}%, Acc: {:.2f}%'.format(Train_loss[-1], + F1[-1], + Acc[-1])) + Epochs.refresh() + weights = copy.deepcopy(length_predictor.state_dict()) + if Acc and Acc[-1] > best_score: + best_score = Acc[-1] + best_weights = weights + length_predictor.load_state_dict(best_weights) + if record_runtime: + duration = time.time()-t0 + runtime_info = {"Architecture": length_predictor.name, + "Number of Epochs": self.epochs, "Runtime (s)": duration} + if not os.path.exists(self.storage_path+"/runtime/"): + os.mkdir(self.storage_path+"/runtime/") + with open(self.storage_path+"/runtime/runtime"+"_"+desc+".json", "w") as file: + json.dump(runtime_info, file, indent=3) + results_dict = dict() + print("Top performance: loss: {:.4f}, f1: {:.2f}% ... 
" + "acc: {:.2f}%".format(min(Train_loss), max(F1), max(Acc)), "weights saved based on Acc best score!") + print() + results_dict.update({"Train Max F1": max(F1), "Train Acc": max(Acc), + "Train Min Loss": min(Train_loss)}) + if not os.path.exists(self.storage_path+"/results/"): + os.mkdir(self.storage_path+"/results/") + with open(self.storage_path+"/results/"+"results"+"_"+desc+".json", "w") as file: + json.dump(results_dict, file, indent=3) + if save_model: + if not os.path.exists(self.storage_path+"/trained_models/"): + os.mkdir(self.storage_path+"/trained_models/") + torch.save(length_predictor.state_dict(), self.storage_path+"/trained_models/"+"trained_"+desc+".pt") + print("{} saved".format(length_predictor.name)) + if not os.path.exists(self.storage_path+"/metrics/"): + os.mkdir(self.storage_path+"/metrics/") + with open(self.storage_path+"/metrics/"+"metrics_"+desc+".json", "w") as plot_file: + json.dump({"f1": F1, "acc": Acc, "loss": Train_loss}, plot_file, + indent=3) diff --git a/ontolearn/concept_learner.py b/ontolearn/concept_learner.py index aa74fd4b..a5cf7580 100644 --- a/ontolearn/concept_learner.py +++ b/ontolearn/concept_learner.py @@ -15,6 +15,7 @@ from torch import nn from torch.utils.data import DataLoader from torch.functional import F +from torch.nn.utils.rnn import pad_sequence from torch.nn.init import xavier_normal_ from deap import gp, tools, base, creator @@ -24,7 +25,7 @@ from ontolearn.base_concept_learner import BaseConceptLearner, RefinementBasedConceptLearner from ontolearn.base.owl.utils import EvaluatedDescriptionSet, ConceptOperandSorter, OperandSetTransform from ontolearn.data_struct import PrepareBatchOfTraining, PrepareBatchOfPrediction, NCESDataLoader, \ - NCESDataLoaderInference + NCESDataLoaderInference, CLIPDataLoader, CLIPDataLoaderInference from ontolearn.ea_algorithms import AbstractEvolutionaryAlgorithm, EASimple from ontolearn.ea_initialization import AbstractEAInitialization, EARandomInitialization, 
EARandomWalkInitialization from ontolearn.ea_utils import PrimitiveFactory, OperatorVocabulary, ToolboxVocabulary, Tree, escape, ind_to_string, \ @@ -33,7 +34,7 @@ from ontolearn.heuristics import OCELHeuristic from ontolearn.learning_problem import PosNegLPStandard, EncodedPosNegLPStandard from ontolearn.metrics import Accuracy, F1 -from ontolearn.refinement_operators import LengthBasedRefinement +from ontolearn.refinement_operators import LengthBasedRefinement, ExpressRefinement from ontolearn.search import EvoLearnerNode, NCESNode, HeuristicOrderedNode, LBLNode, OENode, TreeNode, LengthOrderedNode, \ QualityOrderedNode, RL_State, DRILLSearchTreePriorityQueue, EvaluatedConcept from ontolearn.utils import oplogging, create_experiment_folder @@ -41,7 +42,9 @@ from ontolearn.value_splitter import AbstractValueSplitter, BinningValueSplitter, EntropyValueSplitter from ontolearn.base_nces import BaseNCES from ontolearn.nces_architectures import LSTM, GRU, SetTransformer +from ontolearn.clip_architectures import LengthLearner_LSTM, LengthLearner_GRU, LengthLearner_CNN, LengthLearner_SetTransformer from ontolearn.nces_trainer import NCESTrainer, before_pad +from ontolearn.clip_trainer import CLIPTrainer from ontolearn.nces_utils import SimpleSolution from owlapy.model import OWLClassExpression, OWLDataProperty, OWLLiteral, OWLNamedIndividual, OWLReasoner, OWLClass from owlapy.render import DLSyntaxObjectRenderer @@ -1061,6 +1064,271 @@ def clean(self): pass self._cache.clear() super().clean() + + +class CLIP(CELOE): + """Concept Learner with Integrated Length Prediction. + This algorithm extends the CELOE algorithm by using concept length predictors and a different refinement operator, i.e., ExpressRefinement + + Attributes: + best_descriptions (EvaluatedDescriptionSet[OENode, QualityOrderedNode]): Best hypotheses ordered. + best_only (bool): If False pick only nodes with quality < 1.0, else pick without quality restrictions. 
+ calculate_min_max (bool): Calculate minimum and maximum horizontal expansion? Statistical purpose only. + heuristic_func (AbstractHeuristic): Function to guide the search heuristic. + heuristic_queue (SortedSet[OENode]): A sorted set that compares the nodes based on Heuristic. + iter_bound (int): Limit to stop the algorithm after n refinement steps are done. + kb (KnowledgeBase): The knowledge base that the concept learner is using. + max_child_length (int): Limit the length of concepts generated by the refinement operator. + max_he (int): Maximal value of horizontal expansion. + max_num_of_concepts_tested (int) Limit to stop the algorithm after n concepts tested. + max_runtime (int): Limit to stop the algorithm after n seconds. + min_he (int): Minimal value of horizontal expansion. + name (str): Name of the model = 'celoe_python'. + _number_of_tested_concepts (int): Yes, you got it. This stores the number of tested concepts. + operator (BaseRefinement): Operator used to generate refinements. + quality_func (AbstractScorer) The quality function to be used. + reasoner (OWLReasoner): The reasoner that this model is using. + search_tree (Dict[OWLClassExpression, TreeNode[OENode]]): Dict to store the TreeNode for a class expression. + start_class (OWLClassExpression): The starting class expression for the refinement operation. + start_time (float): The time when :meth:`fit` starts the execution. Used to calculate the total time :meth:`fit` + takes to execute. + terminate_on_goal (bool): Whether to stop the algorithm if a perfect solution is found. 
+ + """ + __slots__ = 'best_descriptions', 'max_he', 'min_he', 'best_only', 'calculate_min_max', 'heuristic_queue', \ + 'search_tree', '_learning_problem', '_max_runtime', '_seen_norm_concepts', 'predictor_name', 'pretrained_predictor_name', \ + 'load_pretrained', 'output_size', 'num_examples', 'path_of_embeddings', 'instance_embeddings', 'input_size', 'device', 'length_predictor', \ + 'num_workers', 'knowledge_base_path' + + name = 'clip' + def __init__(self, + knowledge_base: KnowledgeBase, + knowledge_base_path = '', + reasoner: Optional[OWLReasoner] = None, + refinement_operator: Optional[BaseRefinement[OENode]] = ExpressRefinement, + quality_func: Optional[AbstractScorer] = None, + heuristic_func: Optional[AbstractHeuristic] = None, + terminate_on_goal: Optional[bool] = None, + iter_bound: Optional[int] = None, + max_num_of_concepts_tested: Optional[int] = None, + max_runtime: Optional[int] = None, + max_results: int = 10, + best_only: bool = False, + calculate_min_max: bool = True, + path_of_embeddings="", + predictor_name = None, + pretrained_predictor_name = ["SetTransformer", "LSTM", "GRU", "CNN"], + load_pretrained = False, + num_workers = 4, + num_examples = 1000, + output_size = 15 + ): + super().__init__(knowledge_base, + reasoner, + refinement_operator, + quality_func, + heuristic_func, + terminate_on_goal, + iter_bound, + max_num_of_concepts_tested, + max_runtime, + max_results, + best_only, + calculate_min_max) + assert hasattr(refinement_operator, "expressivity"), f"CLIP was developed to run more efficiently with ExpressRefinement, not {refinement_operator}" + self.predictor_name = predictor_name + self.pretrained_predictor_name = pretrained_predictor_name + self.knowledge_base_path = knowledge_base_path + self.load_pretrained = load_pretrained + self.num_workers = num_workers + self.output_size = output_size + self.num_examples = num_examples + self.path_of_embeddings = path_of_embeddings + assert os.path.isfile(self.path_of_embeddings), '!!! 
Wrong path for CLIP embeddings' + self.instance_embeddings = pd.read_csv(path_of_embeddings, index_col=0) + self.input_size = self.instance_embeddings.shape[1] + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.length_predictor = self.get_length_predictor() + + def get_length_predictor(self): + def load_model(predictor_name, load_pretrained): + if predictor_name is None: + return [] + if predictor_name == 'SetTransformer': + model = LengthLearner_SetTransformer(self.input_size, self.output_size, proj_dim=256, num_heads=4, num_seeds=1, num_inds=32) + elif predictor_name == 'GRU': + model = LengthLearner_GRU(self.input_size, self.output_size, proj_dim=256, rnn_n_layers=2, drop_prob=0.2) + elif predictor_name == 'LSTM': + model = LengthLearner_LSTM(self.input_size, self.output_size, proj_dim=256, rnn_n_layers=2, drop_prob=0.2) + elif predictor_name == 'CNN': + model = LengthLearner_CNN(self.input_size, self.output_size, self.num_examples, proj_dim=256, kernel_size=[[5,7], [5,7]], stride=[[3,3], [3,3]]) + pretrained_model_path = self.path_of_embeddings.split("embeddings")[0] + "trained_models/trained_" + predictor_name + ".pt" + if load_pretrained and os.path.isfile(pretrained_model_path): + model.load_state_dict(torch.load(pretrained_model_path, map_location=self.device)) + model.eval() + print("\n Loaded length predictor!") + return model + + if not self.load_pretrained: + return [load_model(self.predictor_name, self.load_pretrained)] + elif self.load_pretrained and isinstance(self.pretrained_predictor_name, str): + return [load_model(self.pretrained_predictor_name, self.load_pretrained)] + elif self.load_pretrained and isinstance(self.pretrained_predictor_name, list): + return [load_model(name, self.load_pretrained) for name in self.pretrained_predictor_name] + + def refresh(self): + self.length_predictor = self.get_length_predictor() + + def collate_batch(self, batch): + pos_emb_list = [] + neg_emb_list = [] + target_labels = [] + 
for pos_emb, neg_emb, label in batch: + if pos_emb.ndim != 2: + pos_emb = pos_emb.reshape(1, -1) + if neg_emb.ndim != 2: + neg_emb = neg_emb.reshape(1, -1) + pos_emb_list.append(pos_emb) + neg_emb_list.append(neg_emb) + target_labels.append(label) + pos_emb_list[0] = F.pad(pos_emb_list[0], (0, 0, 0, self.num_examples - pos_emb_list[0].shape[0]), "constant", 0) + pos_emb_list = pad_sequence(pos_emb_list, batch_first=True, padding_value=0) + neg_emb_list[0] = F.pad(neg_emb_list[0], (0, 0, 0, self.num_examples - neg_emb_list[0].shape[0]), "constant", 0) + neg_emb_list = pad_sequence(neg_emb_list, batch_first=True, padding_value=0) + return pos_emb_list, neg_emb_list, torch.LongTensor(target_labels) + + def collate_batch_inference(self, batch): + pos_emb_list = [] + neg_emb_list = [] + for pos_emb, neg_emb in batch: + if pos_emb.ndim != 2: + pos_emb = pos_emb.reshape(1, -1) + if neg_emb.ndim != 2: + neg_emb = neg_emb.reshape(1, -1) + pos_emb_list.append(pos_emb) + neg_emb_list.append(neg_emb) + pos_emb_list[0] = F.pad(pos_emb_list[0], (0, 0, 0, self.num_examples - pos_emb_list[0].shape[0]), "constant", 0) + pos_emb_list = pad_sequence(pos_emb_list, batch_first=True, padding_value=0) + neg_emb_list[0] = F.pad(neg_emb_list[0], (0, 0, 0, self.num_examples - neg_emb_list[0].shape[0]), "constant", 0) + neg_emb_list = pad_sequence(neg_emb_list, batch_first=True, padding_value=0) + return pos_emb_list, neg_emb_list + + def pos_neg_to_tensor(self, pos: Union[Set[OWLNamedIndividual]], neg: Union[Set[OWLNamedIndividual], Set[str]]): + if isinstance(pos[0], OWLNamedIndividual): + pos_str = [ind.get_iri().as_str().split("/")[-1] for ind in pos][:self.num_examples] + neg_str = [ind.get_iri().as_str().split("/")[-1] for ind in neg][:self.num_examples] + elif isinstance(pos[0], str): + pos_str = pos[:self.num_examples] + neg_str = neg[:self.num_examples] + else: + raise ValueError(f"Invalid input type, was expecting OWLNamedIndividual or str but found {type(pos[0])}") + + assert 
self.load_pretrained and self.pretrained_predictor_name, \ + "No pretrained model found. Please first train length predictors, see the <> method below" + + dataset = CLIPDataLoaderInference([("", pos_str, neg_str)], self.instance_embeddings, False, False) + dataloader = DataLoader(dataset, batch_size=1, num_workers=self.num_workers, + collate_fn=self.collate_batch_inference, shuffle=False) + x_pos, x_neg = next(iter(dataloader)) + return x_pos, x_neg + + def predict_length(self, models, x1, x2): + for i, model in enumerate(models): + model.eval() + model.to(self.device) + x1 = x1.to(self.device) + x2 = x2.to(self.device) + if i == 0: + scores = model(x1, x2) + else: + sc = model(x1, x2) + scores = scores + sc + scores = scores / len(models) + prediction = int(scores.argmax(1).cpu()) + print(f"\n***** Predicted length: {prediction} *****\n") + return prediction + + def fit(self, *args, **kwargs): + """ + Find hypotheses that explain pos and neg. + """ + self.clean() + max_runtime = kwargs.pop("max_runtime", None) + learning_problem = self.construct_learning_problem(PosNegLPStandard, args, kwargs) + + assert not self.search_tree + self._learning_problem = learning_problem.encode_kb(self.kb) + + if max_runtime is not None: + self._max_runtime = max_runtime + else: + self._max_runtime = self.max_runtime + + if (self.pretrained_predictor_name is not None) and (self.length_predictor is not None): + x_pos, x_neg = self.pos_neg_to_tensor(list(self._learning_problem.kb_pos)[:self.num_examples], list(self._learning_problem.kb_neg)[:self.num_examples]) + max_length = self.predict_length(self.length_predictor, x_pos, x_neg) + self.operator.max_child_length = max_length + print(f'***** Predicted length: {max_length} *****') + else: + print('\n!!! 
No length predictor provided, running CLIP without length predictor !!!') + + + root = self.make_node(_concept_operand_sorter.sort(self.start_class), is_root=True) + self._add_node(root, None) + assert len(self.heuristic_queue) == 1 + # TODO:CD:suggest to add another assert,e.g. assert #. of instance in root > 1 + + self.start_time = time.time() + for j in range(1, self.iter_bound): + most_promising = self.next_node_to_expand(j) + tree_parent = self.tree_node(most_promising) + minimum_length = most_promising.h_exp + if logger.isEnabledFor(oplogging.TRACE): + logger.debug("now refining %s", most_promising) + for ref in self.downward_refinement(most_promising): + # we ignore all refinements with lower length + # (this also avoids duplicate node children) + # TODO: ignore too high depth + if ref.len < minimum_length: + # ignoring refinement, it does not satisfy minimum_length condition + continue + + # note: tree_parent has to be equal to node_tree_parent(ref.parent_node)! + added = self._add_node(ref, tree_parent) + + goal_found = added and ref.quality == 1.0 + + if goal_found and self.terminate_on_goal: + return self.terminate() + + if self.calculate_min_max: + # This is purely a statistical function, it does not influence CELOE + self.update_min_max_horiz_exp(most_promising) + + if time.time() - self.start_time > self._max_runtime: + return self.terminate() + + if self.number_of_tested_concepts >= self.max_num_of_concepts_tested: + return self.terminate() + + if logger.isEnabledFor(oplogging.TRACE) and j % 100 == 0: + self._log_current_best(j) + + return self.terminate() + + def train(self, data: Iterable[List[Tuple]], epochs=300, batch_size=256, learning_rate=1e-3, decay_rate=0.0, + clip_value=5.0, save_model=True, storage_path=None, optimizer='Adam', record_runtime=True, + example_sizes=None, shuffle_examples=False): + train_dataset = CLIPDataLoader(data, self.instance_embeddings, shuffle_examples=shuffle_examples, example_sizes=example_sizes) + train_dataloader 
= DataLoader(train_dataset, batch_size=batch_size, num_workers=self.num_workers, + collate_fn=self.collate_batch, shuffle=True) + if storage_path is None: + storage_path = self.knowledge_base_path[:self.knowledge_base_path.rfind("/")] + elif not os.path.exists(storage_path): + os.mkdir(storage_path) + trainer = CLIPTrainer(self, epochs=epochs, learning_rate=learning_rate, decay_rate=decay_rate, + clip_value=clip_value, storage_path=storage_path) + trainer.train(train_dataloader, save_model, optimizer, record_runtime) class NCES(BaseNCES): @@ -1110,7 +1378,7 @@ def load_model(learner_name, load_pretrained): 0] + "trained_models/trained_" + learner_name + ".pt" model.load_state_dict(torch.load(model_path, map_location=self.device)) model.eval() - print("\n\n Loaded pretrained model! \n") + print("\n Loaded synthesizer model!") return model if not self.load_pretrained: diff --git a/ontolearn/data_struct.py b/ontolearn/data_struct.py index f956794d..84b19cba 100644 --- a/ontolearn/data_struct.py +++ b/ontolearn/data_struct.py @@ -3,6 +3,7 @@ import torch from collections import deque import pandas as pd +import numpy as np import random @@ -122,7 +123,7 @@ def clear(self): self.rewards.clear() -class BaseDataLoader: +class NCESBaseDataLoader: def __init__(self, vocab, inv_vocab): @@ -154,7 +155,7 @@ def get_labels(self, target): return labels, len(target) -class NCESDataLoader(BaseDataLoader, torch.utils.data.Dataset): +class NCESDataLoader(NCESBaseDataLoader, torch.utils.data.Dataset): def __init__(self, data: list, embeddings, vocab, inv_vocab, shuffle_examples, max_length, example_sizes=None, sorted_examples=True): @@ -190,7 +191,7 @@ def __getitem__(self, idx): self.max_length - length)]).long() -class NCESDataLoaderInference(BaseDataLoader, torch.utils.data.Dataset): +class NCESDataLoaderInference(NCESBaseDataLoader, torch.utils.data.Dataset): def __init__(self, data: list, embeddings, vocab, inv_vocab, shuffle_examples, sorted_examples=True): self.data_raw = data 
@@ -209,6 +210,77 @@ def __getitem__(self, idx): elif self.shuffle_examples: random.shuffle(pos) random.shuffle(neg) - datapoint_pos = torch.FloatTensor(self.embeddings.loc[pos].values) - datapoint_neg = torch.FloatTensor(self.embeddings.loc[neg].values) + datapoint_pos = torch.FloatTensor(self.embeddings.loc[pos].values.squeeze()) + datapoint_neg = torch.FloatTensor(self.embeddings.loc[neg].values.squeeze()) return datapoint_pos, datapoint_neg + + +class CLIPDataLoader(torch.utils.data.Dataset): + + def __init__(self, data: list, embeddings, shuffle_examples, example_sizes: list=None, + k=5, sorted_examples=True): + self.data_raw = data + self.embeddings = embeddings + super().__init__() + self.shuffle_examples = shuffle_examples + self.example_sizes = example_sizes + self.k = k + self.sorted_examples = sorted_examples + + def __len__(self): + return len(self.data_raw) + + def __getitem__(self, idx): + key, value = self.data_raw[idx] + pos = value['positive examples'] + neg = value['negative examples'] + length = value['length'] + if self.example_sizes is not None: + k_pos, k_neg = random.choice(self.example_sizes) + k_pos = min(k_pos, len(pos)) + k_neg = min(k_neg, len(neg)) + selected_pos = random.sample(pos, k_pos) + selected_neg = random.sample(neg, k_neg) + elif self.k is not None: + prob_pos_set = 1.0/(1+np.array(range(min(self.k, len(pos)), len(pos)+1, self.k))) + prob_pos_set = prob_pos_set/prob_pos_set.sum() + prob_neg_set = 1.0/(1+np.array(range(min(self.k, len(neg)), len(neg)+1, self.k))) + prob_neg_set = prob_neg_set/prob_neg_set.sum() + k_pos = np.random.choice(range(min(self.k, len(pos)), len(pos)+1, self.k), replace=False, p=prob_pos_set) + k_neg = np.random.choice(range(min(self.k, len(neg)), len(neg)+1, self.k), replace=False, p=prob_neg_set) + selected_pos = random.sample(pos, k_pos) + selected_neg = random.sample(neg, k_neg) + else: + selected_pos = pos + selected_neg = neg + if self.shuffle_examples: + random.shuffle(selected_pos) + 
random.shuffle(selected_neg) + datapoint_pos = torch.FloatTensor(self.embeddings.loc[selected_pos].values.squeeze()) + datapoint_neg = torch.FloatTensor(self.embeddings.loc[selected_neg].values.squeeze()) + return datapoint_pos, datapoint_neg, torch.LongTensor([length]) + + +class CLIPDataLoaderInference(torch.utils.data.Dataset): + + def __init__(self, data: list, embeddings, shuffle_examples, + sorted_examples=True): + self.data_raw = data + self.embeddings = embeddings + super().__init__() + self.shuffle_examples = shuffle_examples + self.sorted_examples = sorted_examples + + def __len__(self): + return len(self.data_raw) + + def __getitem__(self, idx): + _, pos, neg = self.data_raw[idx] + if self.sorted_examples: + pos, neg = sorted(pos), sorted(neg) + elif self.shuffle_examples: + random.shuffle(pos) + random.shuffle(neg) + datapoint_pos = torch.FloatTensor(self.embeddings.loc[pos].values.squeeze()) + datapoint_neg = torch.FloatTensor(self.embeddings.loc[neg].values.squeeze()) + return datapoint_pos, datapoint_neg \ No newline at end of file diff --git a/ontolearn/nces_trainer.py b/ontolearn/nces_trainer.py index 547eca4d..a57eeded 100644 --- a/ontolearn/nces_trainer.py +++ b/ontolearn/nces_trainer.py @@ -6,7 +6,7 @@ from collections import defaultdict import os import json -from ontolearn.data_struct import BaseDataLoader +from ontolearn.data_struct import NCESBaseDataLoader from torch.optim.lr_scheduler import ExponentialLR from torch.nn import functional as F from torch.nn.utils import clip_grad_value_ @@ -43,11 +43,11 @@ def soft(arg1, arg2): arg1_ = arg1 arg2_ = arg2 if isinstance(arg1_, str): - arg1_ = set(before_pad(BaseDataLoader.decompose(arg1_))) + arg1_ = set(before_pad(NCESBaseDataLoader.decompose(arg1_))) else: arg1_ = set(before_pad(arg1_)) if isinstance(arg2_, str): - arg2_ = set(before_pad(BaseDataLoader.decompose(arg2_))) + arg2_ = set(before_pad(NCESBaseDataLoader.decompose(arg2_))) else: arg2_ = set(before_pad(arg2_)) return 
100*float(len(arg1_.intersection(arg2_)))/len(arg1_.union(arg2_)) @@ -56,11 +56,11 @@ def hard(arg1, arg2): arg1_ = arg1 arg2_ = arg2 if isinstance(arg1_, str): - arg1_ = before_pad(BaseDataLoader.decompose(arg1_)) + arg1_ = before_pad(NCESBaseDataLoader.decompose(arg1_)) else: arg1_ = before_pad(arg1_) if isinstance(arg2_, str): - arg2_ = before_pad(BaseDataLoader.decompose(arg2_)) + arg2_ = before_pad(NCESBaseDataLoader.decompose(arg2_)) else: arg2_ = before_pad(arg2_) return 100*float(sum(map(lambda x, y: x == y, arg1_, arg2_)))/max(len(arg1_), len(arg2_)) From 864528cb6cdcac8038781fa536e2abbf7b13819f Mon Sep 17 00:00:00 2001 From: Jean-KOUAGOU Date: Mon, 4 Mar 2024 17:21:53 +0100 Subject: [PATCH 7/8] update documentation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c78cae1..03150771 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Learning algorithms: - **NCES2** → (soon) [Neural Class Expression Synthesis in ALCHIQ(D)](https://papers.dice-research.org/2023/ECML_NCES2/NCES2_public.pdf) - **NCES** → [Neural Class Expression Synthesis](https://link.springer.com/chapter/10.1007/978-3-031-33455-9_13) - **NERO** → (soon) [Learning Permutation-Invariant Embeddings for Description Logic Concepts](https://link.springer.com/chapter/10.1007/978-3-031-30047-9_9) -- **CLIP** → (soon) [Learning Concept Lengths Accelerates Concept Learning in ALC](https://link.springer.com/chapter/10.1007/978-3-031-06981-9_14) +- **CLIP** → [Learning Concept Lengths Accelerates Concept Learning in ALC](https://link.springer.com/chapter/10.1007/978-3-031-06981-9_14) - **CELOE** → [Class Expression Learning for Ontology Engineering](https://www.sciencedirect.com/science/article/abs/pii/S1570826811000023) - **OCEL** → A limited version of CELOE From f9f9b0932bee2b176a5bb22abce330cc894240f5 Mon Sep 17 00:00:00 2001 From: Jean-KOUAGOU Date: Mon, 4 Mar 2024 19:28:20 +0100 Subject: [PATCH 8/8] update documentation --- README.md | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 03150771..f2008dd9 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Note that F1 scores denote the quality of the find/constructed concept w.r.t. E^ ### Mutagenesis Benchmark Results ```shell -python examples/concept_learning_evaluation.py --lps LPs/Mutagenesis/lps.json --kb KGs/Mutagenesis/mutagenesis.owl --max_runtime 60 --report mutagenesis_results.csv && python -c 'import pandas as pd; print(pd.read_csv("mutagenesis_results.csv", index_col=0).to_markdown(floatfmt=".3f"))' +python examples/concept_learning_cv_evaluation.py --path_of_nces_embeddings NCESData/mutagenesis/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings CLIPData/mutagenesis/embeddings/ConEx_entity_embeddings.csv --folds 10 --kb KGs/Mutagenesis/mutagenesis.owl --lps LPs/Mutagenesis/lps.json --max_runtime 60 --report mutagenesis_results.csv && python -c 'import pandas as pd; print(pd.read_csv("mutagenesis_results.csv", index_col=0).to_markdown(floatfmt=".3f"))' ``` | LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | |:---------|----------------:|---------------:|----------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|-----------------:|----------------:|-----------:|---------------:|--------------:|---------:|----------------:|---------------:|----------:|----------------:|---------------:|----------:| @@ -141,7 +141,7 @@ python examples/concept_learning_evaluation.py --lps LPs/Mutagenesis/lps.json -- ### Carcinogenesis Benchmark Results ```shell -python examples/concept_learning_evaluation.py --lps LPs/Carcinogenesis/lps.json --kb KGs/Carcinogenesis/carcinogenesis.owl --max_runtime 60 --report 
carcinogenesis_results.csv && python -c 'import pandas as pd; print(pd.read_csv("carcinogenesis_results.csv", index_col=0).to_markdown(floatfmt=".3f"))' +python examples/concept_learning_cv_evaluation.py --path_of_nces_embeddings NCESData/carcinogenesis/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings CLIPData/carcinogenesis/embeddings/ConEx_entity_embeddings.csv --folds 10 --kb KGs/Carcinogenesis/carcinogenesis.owl --lps LPs/Carcinogenesis/lps.json --max_runtime 60 --report carcinogenesis_results.csv && python -c 'import pandas as pd; print(pd.read_csv("carcinogenesis_results.csv", index_col=0).to_markdown(floatfmt=".3f"))' ``` | LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP |