@@ -163,6 +163,58 @@ FileHandle::FileHandle(const std::string& file_path,
163163 }
164164}
165165
166+ FileHandle::FileHandle (FileHandle&& o) noexcept
167+ : _fd_direct_on{std::exchange (o._fd_direct_on , -1 )},
168+ _fd_direct_off{std::exchange (o._fd_direct_off , -1 )},
169+ _initialized{std::exchange (o._initialized , false )},
170+ _compat_mode{std::exchange (o._compat_mode , CompatMode::AUTO)},
171+ _nbytes{std::exchange (o._nbytes , 0 )},
172+ _handle{std::exchange (o._handle , CUfileHandle_t{})}
173+ {
174+ }
175+
176+ FileHandle& FileHandle::operator =(FileHandle&& o) noexcept
177+ {
178+ _fd_direct_on = std::exchange (o._fd_direct_on , -1 );
179+ _fd_direct_off = std::exchange (o._fd_direct_off , -1 );
180+ _initialized = std::exchange (o._initialized , false );
181+ _compat_mode = std::exchange (o._compat_mode , CompatMode::AUTO);
182+ _nbytes = std::exchange (o._nbytes , 0 );
183+ _handle = std::exchange (o._handle , CUfileHandle_t{});
184+ return *this ;
185+ }
186+
187+ FileHandle::~FileHandle () noexcept { close (); }
188+
189+ bool FileHandle::closed () const noexcept { return !_initialized; }
190+
191+ void FileHandle::close () noexcept
192+ {
193+ try {
194+ if (closed ()) { return ; }
195+
196+ if (!is_compat_mode_preferred ()) { cuFileAPI::instance ().HandleDeregister (_handle); }
197+ _compat_mode = CompatMode::AUTO;
198+ ::close (_fd_direct_off);
199+ if (_fd_direct_on != -1 ) { ::close (_fd_direct_on); }
200+ _fd_direct_on = -1 ;
201+ _fd_direct_off = -1 ;
202+ _initialized = false ;
203+ } catch (...) {
204+ }
205+ }
206+
207+ CUfileHandle_t FileHandle::handle ()
208+ {
209+ if (closed ()) { throw CUfileException (" File handle is closed" ); }
210+ if (is_compat_mode_preferred ()) {
211+ throw CUfileException (" The underlying cuFile handle isn't available in compatibility mode" );
212+ }
213+ return _handle;
214+ }
215+
216+ int FileHandle::fd () const noexcept { return _fd_direct_off; }
217+
166218int FileHandle::fd_open_flags () const { return open_flags (_fd_direct_off); }
167219
168220std::size_t FileHandle::nbytes () const
@@ -172,4 +224,232 @@ std::size_t FileHandle::nbytes() const
172224 return _nbytes;
173225}
174226
227+ std::size_t FileHandle::read (void * devPtr_base,
228+ std::size_t size,
229+ std::size_t file_offset,
230+ std::size_t devPtr_offset,
231+ bool sync_default_stream)
232+ {
233+ if (is_compat_mode_preferred ()) {
234+ return detail::posix_device_read (_fd_direct_off, devPtr_base, size, file_offset, devPtr_offset);
235+ }
236+ if (sync_default_stream) { CUDA_DRIVER_TRY (cudaAPI::instance ().StreamSynchronize (nullptr )); }
237+
238+ KVIKIO_NVTX_SCOPED_RANGE (" cufileRead()" , size);
239+ ssize_t ret = cuFileAPI::instance ().Read (
240+ _handle, devPtr_base, size, convert_size2off (file_offset), convert_size2off (devPtr_offset));
241+ CUFILE_CHECK_BYTES_DONE (ret);
242+ return ret;
243+ }
244+
245+ std::size_t FileHandle::write (const void * devPtr_base,
246+ std::size_t size,
247+ std::size_t file_offset,
248+ std::size_t devPtr_offset,
249+ bool sync_default_stream)
250+ {
251+ _nbytes = 0 ; // Invalidate the computed file size
252+
253+ if (is_compat_mode_preferred ()) {
254+ return detail::posix_device_write (
255+ _fd_direct_off, devPtr_base, size, file_offset, devPtr_offset);
256+ }
257+ if (sync_default_stream) { CUDA_DRIVER_TRY (cudaAPI::instance ().StreamSynchronize (nullptr )); }
258+
259+ KVIKIO_NVTX_SCOPED_RANGE (" cufileWrite()" , size);
260+ ssize_t ret = cuFileAPI::instance ().Write (
261+ _handle, devPtr_base, size, convert_size2off (file_offset), convert_size2off (devPtr_offset));
262+ if (ret == -1 ) {
263+ throw std::system_error (errno, std::generic_category (), " Unable to write file" );
264+ }
265+ if (ret < -1 ) {
266+ throw CUfileException (std::string{" cuFile error at: " } + __FILE__ + " :" +
267+ KVIKIO_STRINGIFY (__LINE__) + " : " + CUFILE_ERRSTR (ret));
268+ }
269+ return ret;
270+ }
271+
272+ std::future<std::size_t > FileHandle::pread (void * buf,
273+ std::size_t size,
274+ std::size_t file_offset,
275+ std::size_t task_size,
276+ std::size_t gds_threshold,
277+ bool sync_default_stream)
278+ {
279+ KVIKIO_NVTX_MARKER (" FileHandle::pread()" , size);
280+ if (is_host_memory (buf)) {
281+ auto op = [this ](void * hostPtr_base,
282+ std::size_t size,
283+ std::size_t file_offset,
284+ std::size_t hostPtr_offset) -> std::size_t {
285+ char * buf = static_cast <char *>(hostPtr_base) + hostPtr_offset;
286+ return detail::posix_host_read<detail::PartialIO::NO>(_fd_direct_off, buf, size, file_offset);
287+ };
288+
289+ return parallel_io (op, buf, size, file_offset, task_size, 0 );
290+ }
291+
292+ CUcontext ctx = get_context_from_pointer (buf);
293+
294+ // Shortcut that circumvent the threadpool and use the POSIX backend directly.
295+ if (size < gds_threshold) {
296+ auto task = [this , ctx, buf, size, file_offset]() -> std::size_t {
297+ PushAndPopContext c (ctx);
298+ return detail::posix_device_read (_fd_direct_off, buf, size, file_offset, 0 );
299+ };
300+ return std::async (std::launch::deferred, task);
301+ }
302+
303+ // Let's synchronize once instead of in each task.
304+ if (sync_default_stream && !is_compat_mode_preferred ()) {
305+ PushAndPopContext c (ctx);
306+ CUDA_DRIVER_TRY (cudaAPI::instance ().StreamSynchronize (nullptr ));
307+ }
308+
309+ // Regular case that use the threadpool and run the tasks in parallel
310+ auto task = [this , ctx](void * devPtr_base,
311+ std::size_t size,
312+ std::size_t file_offset,
313+ std::size_t devPtr_offset) -> std::size_t {
314+ PushAndPopContext c (ctx);
315+ return read (devPtr_base, size, file_offset, devPtr_offset, /* sync_default_stream = */ false );
316+ };
317+ auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info (buf, &ctx);
318+ return parallel_io (task, devPtr_base, size, file_offset, task_size, devPtr_offset);
319+ }
320+
321+ std::future<std::size_t > FileHandle::pwrite (const void * buf,
322+ std::size_t size,
323+ std::size_t file_offset,
324+ std::size_t task_size,
325+ std::size_t gds_threshold,
326+ bool sync_default_stream)
327+ {
328+ KVIKIO_NVTX_MARKER (" FileHandle::pwrite()" , size);
329+ if (is_host_memory (buf)) {
330+ auto op = [this ](const void * hostPtr_base,
331+ std::size_t size,
332+ std::size_t file_offset,
333+ std::size_t hostPtr_offset) -> std::size_t {
334+ const char * buf = static_cast <const char *>(hostPtr_base) + hostPtr_offset;
335+ return detail::posix_host_write<detail::PartialIO::NO>(
336+ _fd_direct_off, buf, size, file_offset);
337+ };
338+
339+ return parallel_io (op, buf, size, file_offset, task_size, 0 );
340+ }
341+
342+ CUcontext ctx = get_context_from_pointer (buf);
343+
344+ // Shortcut that circumvent the threadpool and use the POSIX backend directly.
345+ if (size < gds_threshold) {
346+ auto task = [this , ctx, buf, size, file_offset]() -> std::size_t {
347+ PushAndPopContext c (ctx);
348+ return detail::posix_device_write (_fd_direct_off, buf, size, file_offset, 0 );
349+ };
350+ return std::async (std::launch::deferred, task);
351+ }
352+
353+ // Let's synchronize once instead of in each task.
354+ if (sync_default_stream && !is_compat_mode_preferred ()) {
355+ PushAndPopContext c (ctx);
356+ CUDA_DRIVER_TRY (cudaAPI::instance ().StreamSynchronize (nullptr ));
357+ }
358+
359+ // Regular case that use the threadpool and run the tasks in parallel
360+ auto op = [this , ctx](const void * devPtr_base,
361+ std::size_t size,
362+ std::size_t file_offset,
363+ std::size_t devPtr_offset) -> std::size_t {
364+ PushAndPopContext c (ctx);
365+ return write (devPtr_base, size, file_offset, devPtr_offset, /* sync_default_stream = */ false );
366+ };
367+ auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info (buf, &ctx);
368+ return parallel_io (op, devPtr_base, size, file_offset, task_size, devPtr_offset);
369+ }
370+
371+ void FileHandle::read_async (void * devPtr_base,
372+ std::size_t * size_p,
373+ off_t * file_offset_p,
374+ off_t * devPtr_offset_p,
375+ ssize_t * bytes_read_p,
376+ CUstream stream)
377+ {
378+ if (is_compat_mode_preferred_for_async (_compat_mode)) {
379+ CUDA_DRIVER_TRY (cudaAPI::instance ().StreamSynchronize (stream));
380+ *bytes_read_p =
381+ static_cast <ssize_t >(read (devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p));
382+ } else {
383+ CUFILE_TRY (cuFileAPI::instance ().ReadAsync (
384+ _handle, devPtr_base, size_p, file_offset_p, devPtr_offset_p, bytes_read_p, stream));
385+ }
386+ }
387+
388+ StreamFuture FileHandle::read_async (
389+ void * devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream)
390+ {
391+ StreamFuture ret (devPtr_base, size, file_offset, devPtr_offset, stream);
392+ auto [devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_read_p, stream_] =
393+ ret.get_args ();
394+ read_async (devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_read_p, stream_);
395+ return ret;
396+ }
397+
398+ void FileHandle::write_async (void * devPtr_base,
399+ std::size_t * size_p,
400+ off_t * file_offset_p,
401+ off_t * devPtr_offset_p,
402+ ssize_t * bytes_written_p,
403+ CUstream stream)
404+ {
405+ if (is_compat_mode_preferred_for_async (_compat_mode)) {
406+ CUDA_DRIVER_TRY (cudaAPI::instance ().StreamSynchronize (stream));
407+ *bytes_written_p =
408+ static_cast <ssize_t >(write (devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p));
409+ } else {
410+ CUFILE_TRY (cuFileAPI::instance ().WriteAsync (
411+ _handle, devPtr_base, size_p, file_offset_p, devPtr_offset_p, bytes_written_p, stream));
412+ }
413+ }
414+
415+ StreamFuture FileHandle::write_async (
416+ void * devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream)
417+ {
418+ StreamFuture ret (devPtr_base, size, file_offset, devPtr_offset, stream);
419+ auto [devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_written_p, stream_] =
420+ ret.get_args ();
421+ write_async (devPtr_base_, size_p, file_offset_p, devPtr_offset_p, bytes_written_p, stream_);
422+ return ret;
423+ }
424+
425+ bool FileHandle::is_compat_mode_preferred () const noexcept
426+ {
427+ return defaults::is_compat_mode_preferred (_compat_mode);
428+ }
429+
430+ bool FileHandle::is_compat_mode_preferred_for_async () const noexcept
431+ {
432+ static bool is_extra_symbol_available = is_stream_api_available ();
433+ static bool is_config_path_empty = config_path ().empty ();
434+ return is_compat_mode_preferred () || !is_extra_symbol_available || is_config_path_empty;
435+ }
436+
437+ bool FileHandle::is_compat_mode_preferred_for_async (CompatMode requested_compat_mode)
438+ {
439+ if (defaults::is_compat_mode_preferred (requested_compat_mode)) { return true ; }
440+
441+ if (!is_stream_api_available ()) {
442+ if (requested_compat_mode == CompatMode::AUTO) { return true ; }
443+ throw std::runtime_error (" Missing the cuFile stream api." );
444+ }
445+
446+ // When checking for availability, we also check if cuFile's config file exists. This is
447+ // because even when the stream API is available, it doesn't work if no config file exists.
448+ if (config_path ().empty ()) {
449+ if (requested_compat_mode == CompatMode::AUTO) { return true ; }
450+ throw std::runtime_error (" Missing cuFile configuration file." );
451+ }
452+ return false ;
453+ }
454+
175455} // namespace kvikio
0 commit comments