Skip to content

Commit 961412d

Browse files
rework github _open() implementation to support LFS (#1810)
Co-authored-by: Martin Durant <[email protected]>
1 parent 6b85a47 commit 961412d

File tree

2 files changed

+94
-18
lines changed

2 files changed

+94
-18
lines changed

fsspec/implementations/github.py

+46-18
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import requests
1+
import base64
22

3-
import fsspec
3+
import requests
44

55
from ..spec import AbstractFileSystem
66
from ..utils import infer_storage_options
@@ -16,8 +16,10 @@ class GithubFileSystem(AbstractFileSystem):
1616
repository. You may specify a point in the repos history, by SHA, branch
1717
or tag (default is current master).
1818
19-
Given that code files tend to be small, and that github does not support
20-
retrieving partial content, we always fetch whole files.
19+
For files less than 1 MB in size, file content is returned directly in a
20+
MemoryFile. For larger files, or for files tracked by git-lfs, file content
21+
is returned as an HTTPFile wrapping the ``download_url`` provided by the
22+
GitHub API.
2123
2224
When using fsspec.open, allows URIs of the form:
2325
@@ -36,7 +38,7 @@ class GithubFileSystem(AbstractFileSystem):
3638
"""
3739

3840
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
39-
rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
41+
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
4042
protocol = "github"
4143
timeout = (60, 60) # connect, read timeouts
4244

@@ -63,6 +65,12 @@ def __init__(
6365

6466
self.root = sha
6567
self.ls("")
68+
try:
69+
from .http import HTTPFileSystem
70+
71+
self.http_fs = HTTPFileSystem(**kwargs)
72+
except ImportError:
73+
self.http_fs = None
6674

6775
@property
6876
def kw(self):
@@ -212,28 +220,48 @@ def _open(
212220
path,
213221
mode="rb",
214222
block_size=None,
215-
autocommit=True,
216223
cache_options=None,
217224
sha=None,
218225
**kwargs,
219226
):
220227
if mode != "rb":
221228
raise NotImplementedError
222-
url = self.rurl.format(
229+
230+
# construct a url to hit the GitHub API's repo contents API
231+
url = self.content_url.format(
223232
org=self.org, repo=self.repo, path=path, sha=sha or self.root
224233
)
234+
235+
# make a request to this API, and parse the response as JSON
225236
r = requests.get(url, timeout=self.timeout, **self.kw)
226237
if r.status_code == 404:
227238
raise FileNotFoundError(path)
228239
r.raise_for_status()
229-
return MemoryFile(None, None, r.content)
230-
231-
def cat(self, path, recursive=False, on_error="raise", **kwargs):
232-
paths = self.expand_path(path, recursive=recursive)
233-
urls = [
234-
self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
235-
for u, sh in paths
236-
]
237-
fs = fsspec.filesystem("http")
238-
data = fs.cat(urls, on_error="return")
239-
return {u: v for ((k, v), u) in zip(data.items(), urls)}
240+
content_json = r.json()
241+
242+
# if the response's content key is not empty, try to parse it as base64
243+
if content_json["content"]:
244+
content = base64.b64decode(content_json["content"])
245+
246+
# as long as the content does not start with the string
247+
# "version https://git-lfs.github.com/"
248+
# then it is probably not a git-lfs pointer and we can just return
249+
# the content directly
250+
if not content.startswith(b"version https://git-lfs.github.com/"):
251+
return MemoryFile(None, None, content)
252+
253+
# we land here if the content was not present in the first response
254+
# (regular file over 1MB) or was only a git-lfs pointer (git-lfs tracked file)
255+
# in this case, we let the HTTPFileSystem handle the download
256+
if self.http_fs is None:
257+
raise ImportError(
258+
"Please install fsspec[http] to access github files >1 MB "
259+
"or git-lfs tracked files."
260+
)
261+
return self.http_fs.open(
262+
content_json["download_url"],
263+
mode=mode,
264+
block_size=block_size,
265+
cache_options=cache_options,
266+
**kwargs,
267+
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import fsspec
2+
3+
4+
def test_github_open_small_file():
5+
# test opening a small file <1 MB
6+
with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f:
7+
assert f.readline().startswith(b"species,island")
8+
9+
10+
def test_github_open_large_file():
11+
# test opening a large file >1 MB
12+
# use block_size=0 to get a streaming interface to the file, ensuring that
13+
# we fetch only the parts we need instead of downloading the full file all
14+
# at once
15+
with fsspec.open(
16+
"github://mwaskom:seaborn-data@83bfba7/brain_networks.csv", block_size=0
17+
) as f:
18+
# read only the first 20 bytes of the file
19+
assert f.read(20) == b"network,1,1,2,2,3,3,"
20+
21+
22+
def test_github_open_lfs_file():
23+
# test opening a git-lfs tracked file
24+
with fsspec.open(
25+
"github://cBioPortal:datahub@55cd360"
26+
"/public/acc_2019/data_gene_panel_matrix.txt",
27+
block_size=0,
28+
) as f:
29+
assert f.read(19) == b"SAMPLE_ID\tmutations"
30+
31+
32+
def test_github_cat():
33+
# test using cat to fetch the content of multiple files
34+
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
35+
paths = ["penguins.csv", "mpg.csv"]
36+
cat_result = fs.cat(paths)
37+
assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"}
38+
assert cat_result["penguins.csv"].startswith(b"species,island")
39+
assert cat_result["mpg.csv"].startswith(b"mpg,cylinders")
40+
41+
42+
def test_github_ls():
43+
# test using ls to list the files in a repository
44+
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
45+
ls_result = set(fs.ls(""))
46+
expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"}
47+
# check that the expected files are a subset of the listing result
48+
assert expected.issubset(ls_result)

0 commit comments

Comments
 (0)