DX-90582: move 3 commits from gerrit (#4)

Antropovi · Paul Nienaber · web-flow · commit 42aa3df22a05 · 2025-01-14T14:28:32.000Z
Moving old commits with history

---------

Co-authored-by: Paul Nienaber &lt;paul.nienaber@dremio.com&gt;
diff --git a/flight_sql/CMakeLists.txt b/flight_sql/CMakeLists.txt
@@ -70,7 +70,6 @@ if (MSVC)
     -DCMAKE_DEPENDS_USE_COMPILER=FALSE
     -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/ApacheArrow-prefix/src/ApacheArrow-install
     -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}
-    ${CMAKE_CURRENT_BINARY_DIR}/ApacheArrow-prefix/src/ApacheArrow/cpp
   )
 elseif(APPLE)
   set(ARROW_CMAKE_ARGS
@@ -90,7 +89,6 @@ elseif(APPLE)
           -DCMAKE_DEPENDS_USE_COMPILER=FALSE
           -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/ApacheArrow-prefix/src/ApacheArrow-install
           -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}
-          ${CMAKE_CURRENT_BINARY_DIR}/ApacheArrow-prefix/src/ApacheArrow/cpp
           )
   if (DEFINED CMAKE_TOOLCHAIN_FILE)
     list(APPEND ARROW_CMAKE_ARGS -DARROW_DEPENDENCY_SOURCE=VCPKG)
@@ -109,7 +107,6 @@ else()
     -DCMAKE_DEPENDS_USE_COMPILER=FALSE
     -DOPENSSL_INCLUDE_DIR=${OPENSSL_INCLUDE_DIR}
     -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/ApacheArrow-prefix/src/ApacheArrow-install
-    ${CMAKE_CURRENT_BINARY_DIR}/ApacheArrow-prefix/src/ApacheArrow/cpp
   )
 endif()
 
@@ -120,6 +117,7 @@ message("Using Arrow from ${ARROW_GIT_REPOSITORY} on tag ${ARROW_GIT_TAG}")
 ExternalProject_Add(ApacheArrow
   GIT_REPOSITORY ${ARROW_GIT_REPOSITORY}
   GIT_TAG ${ARROW_GIT_TAG}
+  SOURCE_SUBDIR "cpp"
   CMAKE_ARGS ${ARROW_CMAKE_ARGS})
 
 include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/ApacheArrow-prefix/src/ApacheArrow-install/include)
diff --git a/flight_sql/accessors/date_array_accessor_test.cc b/flight_sql/accessors/date_array_accessor_test.cc
@@ -17,15 +17,23 @@ using namespace arrow;
 using namespace odbcabstraction;
 
 TEST(DateArrayAccessor, Test_Date32Array_CDataType_DATE) {
-  std::vector<int32_t> values = {7589, 12320, 18980, 19095};
+  std::vector<int32_t> values = {7589, 12320, 18980, 19095, -1, 0};
+  std::vector<DATE_STRUCT> expected = {
+    {1990, 10, 12},
+    {2003,  9, 25},
+    {2021, 12, 19},
+    {2022,  4, 13},
+    {1969, 12, 31},
+    {1970,  1,  1},
+  };
 
   std::shared_ptr<Array> array;
   ArrayFromVector<Date32Type, int32_t>(values, &array);
 
   DateArrayFlightSqlAccessor<CDataType_DATE, Date32Array> accessor(
       dynamic_cast<NumericArray<Date32Type> *>(array.get()));
 
-  std::vector<tagDATE_STRUCT> buffer(values.size());
+  std::vector<DATE_STRUCT> buffer(values.size());
   std::vector<ssize_t> strlen_buffer(values.size());
 
   ColumnBinding binding(CDataType_DATE, 0, 0, buffer.data(), 0, strlen_buffer.data());
@@ -37,27 +45,39 @@ TEST(DateArrayAccessor, Test_Date32Array_CDataType_DATE) {
 
   for (size_t i = 0; i < values.size(); ++i) {
     ASSERT_EQ(sizeof(DATE_STRUCT), strlen_buffer[i]);
-    tm date{};
 
-    int64_t converted_time = values[i] * 86400;
-    GetTimeForSecondsSinceEpoch(date, converted_time);
-    ASSERT_EQ((date.tm_year + 1900), buffer[i].year);
-    ASSERT_EQ(date.tm_mon + 1, buffer[i].month);
-    ASSERT_EQ(date.tm_mday, buffer[i].day);
+    ASSERT_EQ(expected[i].year, buffer[i].year);
+    ASSERT_EQ(expected[i].month, buffer[i].month);
+    ASSERT_EQ(expected[i].day, buffer[i].day);
   }
 }
 
 TEST(DateArrayAccessor, Test_Date64Array_CDataType_DATE) {
-  std::vector<int64_t> values = {86400000,  172800000, 259200000, 1649793238110,
-                                 345600000, 432000000, 518400000};
+  std::vector<int64_t> values = {86400000,  172800000, 259200000, 1649793238110, 0, 
+                                 345600000, 432000000, 518400000, -86400000, -17987443200000};
+  std::vector<DATE_STRUCT> expected = {
+    /* year(16), month(u16), day(u16) */
+    {1970,  1,  2},
+    {1970,  1,  3},
+    {1970,  1,  4},
+    {2022,  4, 12},
+    {1970,  1,  1},
+    {1970,  1,  5},
+    {1970,  1,  6},
+    {1970,  1,  7},
+    {1969, 12, 31},
+    // This is the documented lower limit of supported Gregorian dates for some parts of Boost,
+    // however boost::posix_time may go lower?
+    {1400,  1,  1},
+  };
 
   std::shared_ptr<Array> array;
   ArrayFromVector<Date64Type, int64_t>(values, &array);
 
   DateArrayFlightSqlAccessor<CDataType_DATE, Date64Array> accessor(
       dynamic_cast<NumericArray<Date64Type> *>(array.get()));
 
-  std::vector<tagDATE_STRUCT> buffer(values.size());
+  std::vector<DATE_STRUCT> buffer(values.size());
   std::vector<ssize_t> strlen_buffer(values.size());
 
   ColumnBinding binding(CDataType_DATE, 0, 0, buffer.data(), 0, strlen_buffer.data());
@@ -71,11 +91,9 @@ TEST(DateArrayAccessor, Test_Date64Array_CDataType_DATE) {
     ASSERT_EQ(sizeof(DATE_STRUCT), strlen_buffer[i]);
     tm date{};
 
-    int64_t converted_time = values[i] / 1000;
-    GetTimeForSecondsSinceEpoch(date, converted_time);
-    ASSERT_EQ((date.tm_year + 1900), buffer[i].year);
-    ASSERT_EQ(date.tm_mon + 1, buffer[i].month);
-    ASSERT_EQ(date.tm_mday, buffer[i].day);
+    ASSERT_EQ(expected[i].year, buffer[i].year);
+    ASSERT_EQ(expected[i].month, buffer[i].month);
+    ASSERT_EQ(expected[i].day, buffer[i].day);
   }
 }
 
diff --git a/flight_sql/accessors/timestamp_array_accessor.cc b/flight_sql/accessors/timestamp_array_accessor.cc
@@ -7,10 +7,13 @@
 #include "timestamp_array_accessor.h"
 #include "odbcabstraction/calendar_utils.h"
 
+#include <cmath>
+#include <limits>
+
 using namespace arrow;
 
 namespace {
-int64_t GetConversionToSecondsDivisor(TimeUnit::type unit) {
+inline int64_t GetConversionToSecondsDivisor(TimeUnit::type unit) {
   int64_t divisor = 1;
   switch (unit) {
     case TimeUnit::SECOND:
@@ -32,27 +35,25 @@ int64_t GetConversionToSecondsDivisor(TimeUnit::type unit) {
   return divisor;
 }
 
-uint32_t CalculateFraction(TimeUnit::type unit, uint64_t units_since_epoch) {
+uint32_t CalculateFraction(TimeUnit::type unit, int64_t units_since_epoch) {
   // Convert the given remainder and time unit to nanoseconds
   // since the fraction field on TIMESTAMP_STRUCT is in nanoseconds.
-  switch (unit) {
-  case TimeUnit::SECOND:
+  if (unit == TimeUnit::SECOND)
     return 0;
-  case TimeUnit::MILLI:
-    // 1000000 nanoseconds = 1 millisecond.
-    return (units_since_epoch %
-            driver::odbcabstraction::MILLI_TO_SECONDS_DIVISOR) *
-           1000000;
-  case TimeUnit::MICRO:
-    // 1000 nanoseconds = 1 microsecond.
-    return (units_since_epoch %
-           driver::odbcabstraction::MICRO_TO_SECONDS_DIVISOR) * 1000;
-  case TimeUnit::NANO:
-    // 1000 nanoseconds = 1 microsecond.
-    return (units_since_epoch %
-            driver::odbcabstraction::NANO_TO_SECONDS_DIVISOR);
-  }
-  return 0;
+
+  const int64_t divisor = GetConversionToSecondsDivisor(unit);
+  const int64_t nano_divisor = GetConversionToSecondsDivisor(TimeUnit::NANO);
+
+  // C++ '%' truncates toward zero, so a negative input yields a remainder in
+  // (-divisor, 0].  Shifting a negative remainder up by one divisor gives the
+  // floor-style remainder in [0, divisor), i.e. the offset from the start of
+  // the containing second.  Working on the remainder (whose magnitude is below
+  // divisor) rather than shifting units_since_epoch itself means no
+  // intermediate value can overflow, even for inputs near INT64_MIN/INT64_MAX.
+  int64_t remainder = units_since_epoch % divisor;
+  if (remainder < 0)
+    remainder += divisor;
+  return remainder * (nano_divisor / divisor);
 }
 } // namespace
 
@@ -72,11 +73,24 @@ TimestampArrayFlightSqlAccessor<TARGET_TYPE, UNIT>::MoveSingleCell_impl(
     ColumnBinding *binding, int64_t arrow_row, int64_t cell_counter,
     int64_t &value_offset, bool update_value_offset,
     odbcabstraction::Diagnostics &diagnostics) {
+  // Times less than the minimum integer number of seconds that can be represented
+  // for each time unit will not convert correctly.  This is mostly interesting for
+  // nanoseconds as timestamps in other units are outside of the accepted range of
+  // Gregorian dates.
   auto *buffer = static_cast<TIMESTAMP_STRUCT *>(binding->buffer);
 
   int64_t value = this->GetArray()->Value(arrow_row);
   const auto divisor = GetConversionToSecondsDivisor(UNIT);
-  const auto converted_result_seconds = value / divisor;
+  const auto converted_result_seconds =
+    // We want floor division here; C++ will round towards zero
+    (value < 0)
+    // Floor division: Shift all "fractional" (not a multiple of divisor) values so they round towards
+    // zero (and to the same value) along with the "floor" less than them, then subtract 1 to get back
+    // to the floor.  Alternatively we could shift negatively by (divisor - 1), but this breaks near
+    // INT64_MIN, causing signed integer overflow.
+    ? ((value + 1) / divisor) - 1
+    // Towards zero is already floor
+    : value / divisor;
   tm timestamp = {0};
 
   GetTimeForSecondsSinceEpoch(timestamp, converted_result_seconds);
diff --git a/flight_sql/accessors/timestamp_array_accessor_test.cc b/flight_sql/accessors/timestamp_array_accessor_test.cc
@@ -17,9 +17,27 @@ using namespace arrow;
 using namespace odbcabstraction;
 
 TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_MILLI) {
-  std::vector<int64_t> values = {86400370,  172800000, 259200000, 1649793238110LL,
-                                 345600000, 432000000, 518400000};
-
+  std::vector<int64_t> values = {86400370,  172800000, 259200000, 1649793238110LL, 345600000,
+                                 432000000, 518400000, -86399000, 0, -86399999, -86399001,
+                                 86400001, 86400999};
+  std::vector<TIMESTAMP_STRUCT> expected = {
+    /* year(16), month(u16), day(u16), hour(u16), minute(u16), second(u16), fraction(u32) */
+    {1970,  1,  2,  0,  0,  0, 370000000},
+    {1970,  1,  3,  0,  0,  0, 0},
+    {1970,  1,  4,  0,  0,  0, 0},
+    {2022,  4, 12, 19, 53, 58, 110000000},
+    {1970,  1,  5,  0,  0,  0, 0},
+    {1970,  1,  6,  0,  0,  0, 0},
+    {1970,  1,  7,  0,  0,  0, 0},
+    {1969, 12, 31,  0,  0,  1, 0},
+    {1970,  1,  1,  0,  0,  0, 0},
+    /* Tests both ends of the fraction rounding range to ensure we don't tip the wrong way */
+    {1969, 12, 31,  0,  0,  0,   1000000},
+    {1969, 12, 31,  0,  0,  0, 999000000},
+    {1970,  1,  2,  0,  0,  0,   1000000},
+    {1970,  1,  2,  0,  0,  0, 999000000},
+  };
+  
   std::shared_ptr<Array> timestamp_array;
 
   auto timestamp_field = field("timestamp_field", timestamp(TimeUnit::MILLI));
@@ -40,26 +58,31 @@ TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_MILLI) {
   for (size_t i = 0; i < values.size(); ++i) {
     ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), strlen_buffer[i]);
 
-    tm date{};
-
-    auto converted_time = values[i] / MILLI_TO_SECONDS_DIVISOR;
-    GetTimeForSecondsSinceEpoch(date, converted_time);
-
-    ASSERT_EQ(buffer[i].year, 1900 + (date.tm_year));
-    ASSERT_EQ(buffer[i].month, date.tm_mon + 1);
-    ASSERT_EQ(buffer[i].day, date.tm_mday);
-    ASSERT_EQ(buffer[i].hour, date.tm_hour);
-    ASSERT_EQ(buffer[i].minute, date.tm_min);
-    ASSERT_EQ(buffer[i].second, date.tm_sec);
-
-    constexpr uint32_t NANOSECONDS_PER_MILLI = 1000000;
-    ASSERT_EQ(buffer[i].fraction, (values[i] % MILLI_TO_SECONDS_DIVISOR) * NANOSECONDS_PER_MILLI);
+    ASSERT_EQ(buffer[i].year, expected[i].year);
+    ASSERT_EQ(buffer[i].month, expected[i].month);
+    ASSERT_EQ(buffer[i].day, expected[i].day);
+    ASSERT_EQ(buffer[i].hour, expected[i].hour);
+    ASSERT_EQ(buffer[i].minute, expected[i].minute);
+    ASSERT_EQ(buffer[i].second, expected[i].second);
+    ASSERT_EQ(buffer[i].fraction, expected[i].fraction);
   }
 }
 
 TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_SECONDS) {
   std::vector<int64_t> values = {86400,  172800, 259200, 1649793238,
-                                 345600, 432000, 518400};
+                                 345600, 432000, 518400, -86399, 0};
+  std::vector<TIMESTAMP_STRUCT> expected = {
+    /* year(16), month(u16), day(u16), hour(u16), minute(u16), second(u16), fraction(u32) */
+    {1970,  1,  2,  0,  0,  0, 0},
+    {1970,  1,  3,  0,  0,  0, 0},
+    {1970,  1,  4,  0,  0,  0, 0},
+    {2022,  4, 12, 19, 53, 58, 0},
+    {1970,  1,  5,  0,  0,  0, 0},
+    {1970,  1,  6,  0,  0,  0, 0},
+    {1970,  1,  7,  0,  0,  0, 0},
+    {1969, 12, 31,  0,  0,  1, 0},
+    {1970,  1,  1,  0,  0,  0, 0},
+  };
 
   std::shared_ptr<Array> timestamp_array;
 
@@ -81,23 +104,27 @@ TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_SECONDS) {
 
   for (size_t i = 0; i < values.size(); ++i) {
     ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), strlen_buffer[i]);
-    tm date{};
-
-    auto converted_time = values[i];
-    GetTimeForSecondsSinceEpoch(date, converted_time);
 
-    ASSERT_EQ(buffer[i].year, 1900 + (date.tm_year));
-    ASSERT_EQ(buffer[i].month, date.tm_mon + 1);
-    ASSERT_EQ(buffer[i].day, date.tm_mday);
-    ASSERT_EQ(buffer[i].hour, date.tm_hour);
-    ASSERT_EQ(buffer[i].minute, date.tm_min);
-    ASSERT_EQ(buffer[i].second, date.tm_sec);
+    ASSERT_EQ(buffer[i].year, expected[i].year);
+    ASSERT_EQ(buffer[i].month, expected[i].month);
+    ASSERT_EQ(buffer[i].day, expected[i].day);
+    ASSERT_EQ(buffer[i].hour, expected[i].hour);
+    ASSERT_EQ(buffer[i].minute, expected[i].minute);
+    ASSERT_EQ(buffer[i].second, expected[i].second);
     ASSERT_EQ(buffer[i].fraction, 0);
   }
 }
 
 TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_MICRO) {
-  std::vector<int64_t> values = {86400000000, 1649793238000000};
+  std::vector<int64_t> values = {0, 86400000000, 1649793238000000, -86399999999, -86399000001};
+  std::vector<TIMESTAMP_STRUCT> expected = {
+    /* year(16), month(u16), day(u16), hour(u16), minute(u16), second(u16), fraction(u32) */
+    {1970,  1,  1,  0,  0,  0, 0},
+    {1970,  1,  2,  0,  0,  0, 0},
+    {2022,  4, 12, 19, 53, 58, 0},
+    {1969, 12, 31,  0,  0,  0,      1000},
+    {1969, 12, 31,  0,  0,  0, 999999000},
+  };
 
   std::shared_ptr<Array> timestamp_array;
 
@@ -120,24 +147,31 @@ TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_MICRO) {
   for (size_t i = 0; i < values.size(); ++i) {
     ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), strlen_buffer[i]);
 
-    tm date{};
-
-    auto converted_time = values[i] / MICRO_TO_SECONDS_DIVISOR;
-    GetTimeForSecondsSinceEpoch(date, converted_time);
-
-    ASSERT_EQ(buffer[i].year, 1900 + (date.tm_year));
-    ASSERT_EQ(buffer[i].month, date.tm_mon + 1);
-    ASSERT_EQ(buffer[i].day, date.tm_mday);
-    ASSERT_EQ(buffer[i].hour, date.tm_hour);
-    ASSERT_EQ(buffer[i].minute, date.tm_min);
-    ASSERT_EQ(buffer[i].second, date.tm_sec);
-    constexpr uint32_t MICROS_PER_NANO = 1000;
-    ASSERT_EQ(buffer[i].fraction, (values[i] % MICRO_TO_SECONDS_DIVISOR) * MICROS_PER_NANO);
+    ASSERT_EQ(buffer[i].year, expected[i].year);
+    ASSERT_EQ(buffer[i].month, expected[i].month);
+    ASSERT_EQ(buffer[i].day, expected[i].day);
+    ASSERT_EQ(buffer[i].hour, expected[i].hour);
+    ASSERT_EQ(buffer[i].minute, expected[i].minute);
+    ASSERT_EQ(buffer[i].second, expected[i].second);
+    ASSERT_EQ(buffer[i].fraction, expected[i].fraction);
   }
 }
 
 TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_NANO) {
-  std::vector<int64_t> values = {86400000010000, 1649793238000000000};
+  std::vector<int64_t> values = {86400000010000, 1649793238000000000, -86399999999999, -86399000000001,
+                                 86400000000001, 86400999999999, 0, -9223372036000000001};
+  std::vector<TIMESTAMP_STRUCT> expected = {
+    /* year(16), month(u16), day(u16), hour(u16), minute(u16), second(u16), fraction(u32) */
+    {1970,  1,  2,  0,  0,  0,     10000},
+    {2022,  4, 12, 19, 53, 58, 0},
+    {1969, 12, 31,  0,  0,  0,         1},
+    {1969, 12, 31,  0,  0,  0, 999999999},
+    {1970,  1,  2,  0,  0,  0,         1},
+    {1970,  1,  2,  0,  0,  0, 999999999},
+    {1970,  1,  1,  0,  0,  0, 0},
+    /* Test within range where floor (seconds) value is below INT64_MIN in nanoseconds */
+    {1677,  9, 21,  0, 12, 43, 999999999},
+  };
 
   std::shared_ptr<Array> timestamp_array;
 
@@ -159,19 +193,16 @@ TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_NANO) {
 
   for (size_t i = 0; i < values.size(); ++i) {
     ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), strlen_buffer[i]);
-    tm date{};
-
-    auto converted_time = values[i] / NANO_TO_SECONDS_DIVISOR;
-    GetTimeForSecondsSinceEpoch(date, converted_time);
-
-    ASSERT_EQ(buffer[i].year, 1900 + (date.tm_year));
-    ASSERT_EQ(buffer[i].month, date.tm_mon + 1);
-    ASSERT_EQ(buffer[i].day, date.tm_mday);
-    ASSERT_EQ(buffer[i].hour, date.tm_hour);
-    ASSERT_EQ(buffer[i].minute, date.tm_min);
-    ASSERT_EQ(buffer[i].second, date.tm_sec);
-    ASSERT_EQ(buffer[i].fraction, (values[i] % NANO_TO_SECONDS_DIVISOR));
+
+    ASSERT_EQ(buffer[i].year, expected[i].year);
+    ASSERT_EQ(buffer[i].month, expected[i].month);
+    ASSERT_EQ(buffer[i].day, expected[i].day);
+    ASSERT_EQ(buffer[i].hour, expected[i].hour);
+    ASSERT_EQ(buffer[i].minute, expected[i].minute);
+    ASSERT_EQ(buffer[i].second, expected[i].second);
+    ASSERT_EQ(buffer[i].fraction, expected[i].fraction);
   }
 }
+
 } // namespace flight_sql
 } // namespace driver
diff --git a/odbcabstraction/calendar_utils.cc b/odbcabstraction/calendar_utils.cc
diff --git a/vcpkg.json b/vcpkg.json