1
- import requests
1
+ import base64
2
2
3
- import fsspec
3
+ import requests
4
4
5
5
from ..spec import AbstractFileSystem
6
6
from ..utils import infer_storage_options
@@ -16,8 +16,10 @@ class GithubFileSystem(AbstractFileSystem):
16
16
repository. You may specify a point in the repo's history, by SHA, branch
17
17
or tag (default is current master).
18
18
19
- Given that code files tend to be small, and that github does not support
20
- retrieving partial content, we always fetch whole files.
19
+ For files less than 1 MB in size, file content is returned directly in a
20
+ MemoryFile. For larger files, or for files tracked by git-lfs, file content
21
+ is returned as an HTTPFile wrapping the ``download_url`` provided by the
22
+ GitHub API.
21
23
22
24
When using fsspec.open, allows URIs of the form:
23
25
@@ -36,7 +38,7 @@ class GithubFileSystem(AbstractFileSystem):
36
38
"""
37
39
38
40
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
39
- rurl = "https://raw.githubusercontent .com/{org}/{repo}/{sha} /{path}"
41
+ content_url = "https://api.github .com/repos/ {org}/{repo}/contents /{path}?ref={sha }"
40
42
protocol = "github"
41
43
timeout = (60 , 60 ) # connect, read timeouts
42
44
@@ -63,6 +65,12 @@ def __init__(
63
65
64
66
self .root = sha
65
67
self .ls ("" )
68
+ try :
69
+ from .http import HTTPFileSystem
70
+
71
+ self .http_fs = HTTPFileSystem (** kwargs )
72
+ except ImportError :
73
+ self .http_fs = None
66
74
67
75
@property
68
76
def kw (self ):
@@ -212,28 +220,48 @@ def _open(
212
220
path ,
213
221
mode = "rb" ,
214
222
block_size = None ,
215
- autocommit = True ,
216
223
cache_options = None ,
217
224
sha = None ,
218
225
** kwargs ,
219
226
):
220
227
if mode != "rb" :
221
228
raise NotImplementedError
222
- url = self .rurl .format (
229
+
230
+ # construct a url to hit the GitHub API's repo contents API
231
+ url = self .content_url .format (
223
232
org = self .org , repo = self .repo , path = path , sha = sha or self .root
224
233
)
234
+
235
+ # make a request to this API, and parse the response as JSON
225
236
r = requests .get (url , timeout = self .timeout , ** self .kw )
226
237
if r .status_code == 404 :
227
238
raise FileNotFoundError (path )
228
239
r .raise_for_status ()
229
- return MemoryFile (None , None , r .content )
230
-
231
- def cat (self , path , recursive = False , on_error = "raise" , ** kwargs ):
232
- paths = self .expand_path (path , recursive = recursive )
233
- urls = [
234
- self .rurl .format (org = self .org , repo = self .repo , path = u , sha = self .root )
235
- for u , sh in paths
236
- ]
237
- fs = fsspec .filesystem ("http" )
238
- data = fs .cat (urls , on_error = "return" )
239
- return {u : v for ((k , v ), u ) in zip (data .items (), urls )}
240
+ content_json = r .json ()
241
+
242
+ # if the response's content key is not empty, try to parse it as base64
243
+ if content_json ["content" ]:
244
+ content = base64 .b64decode (content_json ["content" ])
245
+
246
+ # as long as the content does not start with the string
247
+ # "version https://git-lfs.github.com/"
248
+ # then it is probably not a git-lfs pointer and we can just return
249
+ # the content directly
250
+ if not content .startswith (b"version https://git-lfs.github.com/" ):
251
+ return MemoryFile (None , None , content )
252
+
253
+ # we land here if the content was not present in the first response
254
+ # (regular file over 1MB or git-lfs tracked file)
255
+ # in this case, we let the HTTPFileSystem handle the download
256
+ if self .http_fs is None :
257
+ raise ImportError (
258
+ "Please install fsspec[http] to access github files >1 MB "
259
+ "or git-lfs tracked files."
260
+ )
261
+ return self .http_fs .open (
262
+ content_json ["download_url" ],
263
+ mode = mode ,
264
+ block_size = block_size ,
265
+ cache_options = cache_options ,
266
+ ** kwargs ,
267
+ )
0 commit comments