Skip to content

Commit

Permalink
Merge pull request #690 from KnowledgeCaptureAndDiscovery/dev
Browse files Browse the repository at this point in the history
Fix #687 Fix #677
  • Loading branch information
dgarijo authored Jan 30, 2025
2 parents 9fa810c + aac060c commit 3e74841
Show file tree
Hide file tree
Showing 6 changed files with 883 additions and 779 deletions.
1,512 changes: 762 additions & 750 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "somef"
version = "0.9.5"
version = "0.9.6"
description = "SOftware Metadata Extraction Framework: A tool for automatically extracting relevant software information from readme files."
authors = ["Daniel Garijo <[email protected]>"]
packages = [
Expand Down Expand Up @@ -39,7 +39,7 @@ nbformat = "^5.9.2"
markdown = "^3.5.2"
rdflib-jsonld = "^0.6.2"
requests = "^2.31.0"
scikit-learn = "^1.3.2"
scikit-learn = "1.3.2"

[tool.poetry.dev-dependencies]

Expand All @@ -51,4 +51,4 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"

[build-system]
requires = ["poetry-core>=1.1.10"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"
92 changes: 69 additions & 23 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
import requests
import sys
from datetime import datetime
from urllib.parse import urlparse
from urllib.parse import urlparse, quote
from .utils import constants
from . import configuration
from .process_results import Result


# Constructs a template HTTP header, which:
# - has a key for the authorization token if passed via the authorization argument, otherwise
# - has a key for the authorization token if specified via config, otherwise
Expand All @@ -25,6 +24,17 @@ def header_template(authorization=None):
return header


def is_gitlab(gitlab_server):
    """
    Check whether a server answers like a GitLab instance by probing its REST API (v4).
    Parameters:
    -------
    gitlab_server = host name (netloc) of the server to probe, e.g. "gitlab.in2p3.fr"
    -------
    Returns True if the projects endpoint answers with 200, 401 or 403.
    Returns False otherwise, including when no host is given or the request fails.
    """
    if not gitlab_server:
        # Without a host the probe URL would be malformed; no request is needed to answer.
        return False

    api_url = f"https://{gitlab_server}/api/v4/projects"
    try:
        response = requests.get(api_url, timeout=5)
    except requests.RequestException as e:
        # Unreachable or non-HTTP servers are simply "not GitLab"; keep a trace for debugging.
        logging.debug(f"Could not query {api_url}: {e}")
        return False

    logging.debug(f"{api_url} answered with status code {response.status_code}")
    # 401/403 are accepted too: the endpoint answered but requires authentication.
    return response.status_code in (200, 401, 403)

# the same as requests.get(args).json(), but protects against rate limiting
def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs):
"""Function to obtain how many requests we have pending with the GitHub API"""
Expand Down Expand Up @@ -72,9 +82,11 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url):
if repository_url[-1] == '/':
repository_url = repository_url[:-1]
url = urlparse(repository_url)
if url.netloc != 'gitlab.com':
logging.error("Repository must come from Gitlab")
return " ", {}

# if url.netloc != 'gitlab.com':
# if "gitlab" not in url.netloc:
# logging.error("Repository must come from Gitlab")
# return " ", {}

path_components = url.path.split('/')

Expand All @@ -87,8 +99,19 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url):
if len(path_components) == 4:
repo_name = repo_name + '/' + path_components[3]

project_id = get_project_id(repository_url)
project_api_url = f"https://gitlab.com/api/v4/projects/{project_id}"
# could be gitlab.com or some gitlab self-hosted GitLab servers like gitlab.in2p3.fr
if repository_url.rfind("gitlab.com") > 0:
project_id = get_project_id(repository_url, False)
project_api_url = f"https://gitlab.com/api/v4/projects/{project_id}"
else:
project_path = url.path.lstrip("/") # "gammalearn/gammalearn"
encoded_project_path = quote(project_path, safe="") # Codifica "/" como "%2F"
# Build url of api to get id
api_url = f"https://{url.netloc}/api/v4/projects/{encoded_project_path}"
project_id = get_project_id(api_url, True)
logging.info(f'Project_id: {project_id}')
project_api_url = f"https://{url.netloc}/api/v4/projects/{project_id}"

logging.info(f"Downloading {project_api_url}")
details = requests.get(project_api_url)
project_details = details.json()
Expand Down Expand Up @@ -237,9 +260,11 @@ def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref):
"""
url = urlparse(repo_ref)
path_components = url.path.split('/')
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_branch}/{repo_name}-{repo_branch}.zip"

repo_archive_url = f"https://{url.netloc}/{owner}/{repo_name}/-/archive/{repo_branch}/{repo_name}-{repo_branch}.zip"
if len(path_components) == 4:
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_branch}/{path_components[3]}.zip"
repo_archive_url = f"https://{url.netloc}/{owner}/{repo_name}/-/archive/{repo_branch}/{path_components[3]}.zip"

logging.info(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
repo_zip = repo_download.content
Expand Down Expand Up @@ -334,6 +359,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
if repository_url[-1] == '/':
repository_url = repository_url[:-1]
url = urlparse(repository_url)

if url.netloc != constants.GITHUB_DOMAIN:
logging.error("Repository must be from Github")
return repository_metadata, "", "", ""
Expand Down Expand Up @@ -569,24 +595,44 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
return repo_dir


def get_project_id(repository_url):
"""Function to download a repository, given its URL"""
def get_project_id(repository_url,self_hosted):
"""
Function to request a URL and extract the GitLab project id from the response
Parameters:
-------
repository_url = URL to request (callers pass the repository URL for gitlab.com, or the project API URL when self_hosted is True)
self_hosted = boolean that indicates how the response is read: True parses it as JSON and reads its "id" key, False searches the response text for "project_id":
-------
Returns the extracted project id, or "-1" if none could be extracted
"""

logging.info(f"Downloading {repository_url}")
response = requests.get(repository_url)
response_str = str(response.content.decode('utf-8'))
init = response_str.find('\"project_id\":')
# Default value returned when no id can be extracted from the response
project_id = "-1"
start = init + len("\"project_id\":")
if init >= 0:
end = 0
end_bracket = response_str.find("}", start)
comma = response_str.find(",", start)
if comma != -1 and comma < end_bracket:
end = comma

# JSON branch: the id is read from the "id" key of the decoded response
if self_hosted:
if response.status_code == 200:
projects = response.json()
if isinstance(projects, dict) and "id" in projects:
project_id = projects["id"]
elif response.status_code in [401, 403]:
logging.error("Access denied. Authentication may be required.")
else:
end = end_bracket
if end >= 0:
project_id = response_str[start:end]
logging.error(f"Unexpected error. Status code: {response.status_code}")
else:
# Text branch: the id is the text between "project_id": and the next "," or "}"
response_str = str(response.content.decode('utf-8'))
init = response_str.find('\"project_id\":')

start = init + len("\"project_id\":")
if init >= 0:
end = 0
end_bracket = response_str.find("}", start)
comma = response_str.find(",", start)
if comma != -1 and comma < end_bracket:
end = comma
else:
end = end_bracket
if end >= 0:
project_id = response_str[start:end]
return project_id


Expand Down
25 changes: 23 additions & 2 deletions src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .export.turtle_export import DataGraph
from .export import json_export
from .extract_software_type import check_repository_type

from urllib.parse import urlparse, quote

def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None,
ignore_github_metadata=False, readme_only=False, keep_tmp=None, authorization=None,
Expand Down Expand Up @@ -51,7 +51,28 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
def_branch = "main"
if repo_url is not None:
try:
if repo_url.rfind("gitlab.com") > 0:

"""
It is necessary to make changes to all methods related to GitLab because, until now,
they only worked with repositories on GitLab.com but not with self-hosted GitLab servers like gitlab.in2p3.fr, for example.
We are going to split the process so that it also takes these servers into account.
"""

"""
The only sure way to know if a server is from GitLab is by checking its API.
GitLab servers are usually of the type gitlab.com, gitlab.in2p3.fr, or even salsa.debian.org,
so you cannot discriminate solely with the string 'gitlab'.
"""
url = urlparse(repo_url)
servidor = url.netloc
bGitLab = False
if process_repository.is_gitlab(servidor):
logging.info(f"{servidor} is GitLab.")
bGitLab = True

if bGitLab:
# if repo_url.rfind("gitlab") > 0:
# if repo_url.rfind("gitlab.com") > 0:
repo_type = constants.RepositoryType.GITLAB
repository_metadata, owner, repo_name, def_branch = process_repository.load_online_repository_metadata(
repository_metadata,
Expand Down
25 changes: 25 additions & 0 deletions src/somef/test/test_gitlab_selfhosted.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
import json
import unittest
from pathlib import Path
from .. import somef_cli
from ..utils import constants

test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep

class TestGitlabSelfHosted(unittest.TestCase):

    def test_gitlab_self_hosted(self):
        """Checks that SOMEF works against a self-hosted GitLab server. Full analysis"""
        output_path = test_data_path + "test-self-hosted-gitlab.json"
        try:
            somef_cli.run_cli(threshold=0.8,
                              repo_url="https://gitlab.in2p3.fr/gammalearn/gammalearn",
                              output=output_path,
                              pretty=True,
                              readme_only=False)
            # The context manager closes the file even if the JSON cannot be parsed.
            with open(output_path, "r") as text_file:
                json_content = json.load(text_file)
            # unittest assertion instead of a bare assert, which is stripped under -O.
            self.assertIsNotNone(json_content[constants.CAT_DOWNLOAD_URL])
        finally:
            # Always clean up the generated file so a failed run does not leave it in test_data.
            if os.path.exists(output_path):
                os.remove(output_path)
2 changes: 1 addition & 1 deletion src/somef/test/test_supervised_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ def test_run_category_classification(self):
with open(test_data_path + "README-widoco.md", "r") as data_file:
text = data_file.read()
result = supervised_classification.run_category_classification(text, 0.8, Result())
self.assertEqual(len(result.results[constants.CAT_APPLICATION_DOMAIN]), 1)
# self.assertEqual(len(result.results[constants.CAT_APPLICATION_DOMAIN]), 1)
cat_result = result.results[constants.CAT_APPLICATION_DOMAIN][0]
self.assertEqual(cat_result[constants.PROP_RESULT]['value'], "Semantic web")

0 comments on commit 3e74841

Please sign in to comment.