Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions fsspec/implementations/tests/test_webhdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import shlex
import subprocess
import time
from datetime import datetime

import pytest

Expand Down Expand Up @@ -208,3 +209,87 @@ def test_protocol_prefixed_path(hdfs_cluster):

file_info = fs.ls(protocol_prefixed_path, detail=True)
assert len(file_info) == 0


def test_modified_nonexistent_path(hdfs_cluster):
    """``modified`` on a path that does not exist raises FileNotFoundError."""
    missing = "/user/testuser/nonexistent_file.txt"
    webhdfs = WebHDFS(
        hdfs_cluster,
        user="testuser",
        data_proxy={"worker.example.com": "localhost"},
    )

    with pytest.raises(FileNotFoundError):
        webhdfs.modified(missing)


def test_modified_time(hdfs_cluster):
    """Check that ``modified`` returns plausible timestamps for a directory and a file.

    The directory timestamp must not lie in the future, and a file written
    afterwards must not be older than the directory timestamp read before it.
    """
    fs = WebHDFS(
        hdfs_cluster,
        user="testuser",
        data_proxy={"worker.example.com": "localhost"},
    )
    dir_path = "/user/testuser/"
    # dir_path already ends with "/", so append the file name directly;
    # inserting another "/" would yield "/user/testuser//testfile.txt".
    file_path = f"{dir_path}testfile.txt"

    fs.mkdir(dir_path)

    # Check first modified time for directories
    modified_dir_date: datetime = fs.modified(dir_path)

    # "Not in the future" is the only bound we can assert here, and even that
    # assumes the server clock is not ahead of the client clock.
    assert modified_dir_date <= datetime.now()

    # Create a file and check modified time again
    with fs.open(file_path, "wb") as f:
        f.write(b"test content")

    modified_file_date: datetime = fs.modified(file_path)
    assert modified_file_date >= modified_dir_date
    assert modified_file_date <= datetime.now()


# NOTE: The following two tests mirror the `modified` tests above. WebHDFS has
# no creation-time API, so `created` uses the modification time as a proxy.


def test_created_nonexistent_path(hdfs_cluster):
    """``created`` on a path that does not exist raises FileNotFoundError."""
    missing = "/user/testuser/nonexistent_file.txt"
    webhdfs = WebHDFS(
        hdfs_cluster,
        user="testuser",
        data_proxy={"worker.example.com": "localhost"},
    )

    with pytest.raises(FileNotFoundError):
        webhdfs.created(missing)


def test_created_time(hdfs_cluster):
    """Check that ``created`` returns plausible timestamps for a directory and a file.

    The directory timestamp must be strictly in the past, and a file written
    afterwards must be strictly newer than the directory timestamp read
    before it.
    """
    fs = WebHDFS(
        hdfs_cluster,
        user="testuser",
        data_proxy={"worker.example.com": "localhost"},
    )
    dir_path = "/user/testuser/"
    # dir_path already ends with "/", so append the file name directly;
    # inserting another "/" would yield "/user/testuser//testfile.txt".
    file_path = f"{dir_path}testfile.txt"

    fs.mkdir(dir_path)

    # Presumably here so the strict "<" comparison against now() below holds.
    time.sleep(1)

    # Check first created time for directories
    created_dir_date: datetime = fs.created(dir_path)

    # "In the past" is the only bound we can assert here, and even that
    # assumes the server clock is not ahead of the client clock.
    assert created_dir_date < datetime.now()

    # Create a file and check created time again
    with fs.open(file_path, "wb") as f:
        f.write(b"test content")

    # Presumably here so the strict "<" comparison against now() below holds.
    time.sleep(1)

    created_file_date: datetime = fs.created(file_path)
    assert created_file_date > created_dir_date
    assert created_file_date < datetime.now()
18 changes: 18 additions & 0 deletions fsspec/implementations/webhdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import tempfile
import uuid
from contextlib import suppress
from datetime import datetime
from urllib.parse import quote

import requests
Expand Down Expand Up @@ -268,6 +269,23 @@ def info(self, path):
info["name"] = path
return self._process_info(info)

def created(self, path):
    """Return the created timestamp of a file as a datetime.datetime

    The API does not provide a creation time, so the ``modificationTime``
    entry of ``self.info(path)`` is used instead. It is divided by 1000
    before being handed to ``datetime.fromtimestamp``.

    Raises RuntimeError when ``modificationTime`` is absent or None.
    """
    millis = self.info(path).get("modificationTime")
    if millis is None:
        raise RuntimeError("Could not retrieve creation time (modification time).")
    return datetime.fromtimestamp(millis / 1000)

def modified(self, path):
    """Return the modified timestamp of a file as a datetime.datetime

    Reads the ``modificationTime`` entry of ``self.info(path)`` and divides
    it by 1000 before handing it to ``datetime.fromtimestamp``.

    Raises RuntimeError when ``modificationTime`` is absent or None.
    """
    millis = self.info(path).get("modificationTime")
    if millis is None:
        raise RuntimeError("Could not retrieve modification time.")
    return datetime.fromtimestamp(millis / 1000)

def ls(self, path, detail=False, **kwargs):
out = self._call("LISTSTATUS", path=path)
infos = out.json()["FileStatuses"]["FileStatus"]
Expand Down
Loading