rework github _open() implementation to support LFS (#1810)

thomasgilgenast · martindurant · web-flow · commit 961412dc83cb · 2025-03-17T17:48:25.000-04:00
Co-authored-by: Martin Durant <martin.durant@alumni.utoronto.ca>
diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py
@@ -1,6 +1,6 @@
-import requests
+import base64
 
-import fsspec
+import requests
 
 from ..spec import AbstractFileSystem
 from ..utils import infer_storage_options
@@ -16,8 +16,10 @@ class GithubFileSystem(AbstractFileSystem):
     repository. You may specify a point in the repos history, by SHA, branch
     or tag (default is current master).
 
-    Given that code files tend to be small, and that github does not support
-    retrieving partial content, we always fetch whole files.
+    For files less than 1 MB in size, file content is returned directly in a
+    MemoryFile. For larger files, or for files tracked by git-lfs, file content
+    is returned as an HTTPFile wrapping the ``download_url`` provided by the
+    GitHub API.
 
     When using fsspec.open, allows URIs of the form:
 
@@ -36,7 +38,7 @@ class GithubFileSystem(AbstractFileSystem):
     """
 
     url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
-    rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
+    content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
     protocol = "github"
     timeout = (60, 60)  # connect, read timeouts
 
@@ -63,6 +65,12 @@ def __init__(
 
         self.root = sha
         self.ls("")
+        try:
+            from .http import HTTPFileSystem
+
+            self.http_fs = HTTPFileSystem(**kwargs)
+        except ImportError:
+            self.http_fs = None
 
     @property
     def kw(self):
@@ -212,28 +220,48 @@ def _open(
         path,
         mode="rb",
         block_size=None,
-        autocommit=True,
         cache_options=None,
         sha=None,
         **kwargs,
     ):
         if mode != "rb":
             raise NotImplementedError
-        url = self.rurl.format(
+
+        # construct a url to hit the GitHub API's repo contents API
+        url = self.content_url.format(
             org=self.org, repo=self.repo, path=path, sha=sha or self.root
         )
+
+        # make a request to this API, and parse the response as JSON
         r = requests.get(url, timeout=self.timeout, **self.kw)
         if r.status_code == 404:
             raise FileNotFoundError(path)
         r.raise_for_status()
-        return MemoryFile(None, None, r.content)
-
-    def cat(self, path, recursive=False, on_error="raise", **kwargs):
-        paths = self.expand_path(path, recursive=recursive)
-        urls = [
-            self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
-            for u, sh in paths
-        ]
-        fs = fsspec.filesystem("http")
-        data = fs.cat(urls, on_error="return")
-        return {u: v for ((k, v), u) in zip(data.items(), urls)}
+        content_json = r.json()
+
+        # if the response's content key is not empty, try to parse it as base64
+        if content_json["content"]:
+            content = base64.b64decode(content_json["content"])
+
+            # as long as the content does not start with the string
+            # "version https://git-lfs.github.com/"
+            # then it is probably not a git-lfs pointer and we can just return
+            # the content directly
+            if not content.startswith(b"version https://git-lfs.github.com/"):
+                return MemoryFile(None, None, content)
+
+        # we land here if the content was not present in the first response
+        # (regular file over 1MB or git-lfs tracked file)
+        # in this case, we let the HTTPFileSystem handle the download
+        if self.http_fs is None:
+            raise ImportError(
+                "Please install fsspec[http] to access github files >1 MB "
+                "or git-lfs tracked files."
+            )
+        return self.http_fs.open(
+            content_json["download_url"],
+            mode=mode,
+            block_size=block_size,
+            cache_options=cache_options,
+            **kwargs,
+        )
diff --git a/fsspec/implementations/tests/test_github.py b/fsspec/implementations/tests/test_github.py
@@ -0,0 +1,48 @@
+import fsspec
+
+
+def test_github_open_small_file():
+    # test opening a small file <1 MB
+    with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f:
+        assert f.readline().startswith(b"species,island")
+
+
+def test_github_open_large_file():
+    # test opening a large file >1 MB
+    # use block_size=0 to get a streaming interface to the file, ensuring that
+    # we fetch only the parts we need instead of downloading the full file all
+    # at once
+    with fsspec.open(
+        "github://mwaskom:seaborn-data@83bfba7/brain_networks.csv", block_size=0
+    ) as f:
+        # read only the first 20 bytes of the file
+        assert f.read(20) == b"network,1,1,2,2,3,3,"
+
+
+def test_github_open_lfs_file():
+    # test opening a git-lfs tracked file
+    with fsspec.open(
+        "github://cBioPortal:datahub@55cd360"
+        "/public/acc_2019/data_gene_panel_matrix.txt",
+        block_size=0,
+    ) as f:
+        assert f.read(19) == b"SAMPLE_ID\tmutations"
+
+
+def test_github_cat():
+    # test using cat to fetch the content of multiple files
+    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
+    paths = ["penguins.csv", "mpg.csv"]
+    cat_result = fs.cat(paths)
+    assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"}
+    assert cat_result["penguins.csv"].startswith(b"species,island")
+    assert cat_result["mpg.csv"].startswith(b"mpg,cylinders")
+
+
+def test_github_ls():
+    # test using ls to list the files in a repository
+    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
+    ls_result = set(fs.ls(""))
+    expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"}
+    # check that the expected files are a subset of the listing result
+    assert expected.issubset(ls_result)