diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc
index 0fd7dd01d..e59f60be8 100644
--- a/src/iceberg/expression/json_serde.cc
+++ b/src/iceberg/expression/json_serde.cc
@@ -17,22 +17,33 @@
  * under the License.
  */
 
-#include
 #include
 #include
-#include
 #include
 #include
 
 #include "iceberg/expression/json_serde_internal.h"
 #include "iceberg/expression/literal.h"
+#include "iceberg/expression/predicate.h"
+#include "iceberg/expression/term.h"
+#include "iceberg/transform.h"
 #include "iceberg/util/checked_cast.h"
 #include "iceberg/util/json_util_internal.h"
 #include "iceberg/util/macros.h"
+#include "iceberg/util/transform_util.h"
 
 namespace iceberg {
 
 namespace {
+// JSON field names
+constexpr std::string_view kType = "type";
+constexpr std::string_view kTerm = "term";
+constexpr std::string_view kTransform = "transform";
+constexpr std::string_view kValue = "value";
+constexpr std::string_view kValues = "values";
+constexpr std::string_view kLeft = "left";
+constexpr std::string_view kRight = "right";
+constexpr std::string_view kChild = "child";
 // Expression type strings
 constexpr std::string_view kTypeTrue = "true";
 constexpr std::string_view kTypeFalse = "false";
@@ -58,6 +69,62 @@ constexpr std::string_view kTypeCountNull = "count-null";
 constexpr std::string_view kTypeCountStar = "count-star";
 constexpr std::string_view kTypeMin = "min";
 constexpr std::string_view kTypeMax = "max";
+constexpr std::string_view kTypeLiteral = "literal";
+constexpr std::string_view kTypeReference = "reference";
+
+/// Returns true if `json` is an object whose "type" field is the string `type`.
+/// The string check matters: reading a non-string "type" as a string would throw
+/// instead of producing a parse error.
+bool IsTypedObject(const nlohmann::json& json, std::string_view type) {
+  if (!json.is_object()) return false;
+  const auto it = json.find(kType);
+  return it != json.end() && it->is_string() &&
+         it->get_ref<const std::string&>() == type;
+}
+
+/// Helper to check if a JSON term represents a transform
+bool IsTransformTerm(const nlohmann::json& json) {
+  return IsTypedObject(json, kTransform) && json.contains(kTerm);
+}
+
+/// Template helper to create predicates from JSON with the appropriate term type
+template
+Result> MakePredicateFromJson(
+    Expression::Operation op, std::shared_ptr> term,
+    const nlohmann::json& json) {
+  if (IsUnaryOperation(op)) {
+    if (json.contains(kValue)) [[unlikely]] {
+      return JsonParseError("Unary predicate has invalid 'value' field: {}",
+                            SafeDumpJson(json));
+    }
+    if (json.contains(kValues)) [[unlikely]] {
+      return JsonParseError("Unary predicate has invalid 'values' field: {}",
+                            SafeDumpJson(json));
+    }
+    return UnboundPredicateImpl::Make(op, std::move(term));
+  }
+
+  if (IsSetOperation(op)) {
+    std::vector literals;
+    if (!json.contains(kValues) || !json[kValues].is_array()) [[unlikely]] {
+      return JsonParseError("Missing or invalid 'values' field for set operation: {}",
+                            SafeDumpJson(json));
+    }
+    for (const auto& val : json[kValues]) {
+      ICEBERG_ASSIGN_OR_RAISE(auto lit, LiteralFromJson(val));
+      literals.push_back(std::move(lit));
+    }
+    return UnboundPredicateImpl::Make(op, std::move(term), std::move(literals));
+  }
+
+  // Literal predicate
+  if (!json.contains(kValue)) [[unlikely]] {
+    return JsonParseError("Missing 'value' field for literal predicate: {}",
+                          SafeDumpJson(json));
+  }
+  ICEBERG_ASSIGN_OR_RAISE(auto literal, LiteralFromJson(json[kValue]));
+  return UnboundPredicateImpl::Make(op, std::move(term), std::move(literal));
+}
 }  // namespace
 
 bool IsUnaryOperation(Expression::Operation op) {
@@ -83,7 +150,7 @@ bool IsSetOperation(Expression::Operation op) {
 }
 
 Result OperationTypeFromJson(const nlohmann::json& json) {
-  if (!json.is_string()) {
+  if (!json.is_string()) [[unlikely]] {
     return JsonParseError("Unable to create operation. Json value is not a string");
   }
   auto typeStr = json.get();
@@ -123,27 +190,263 @@ nlohmann::json ToJson(Expression::Operation op) {
   return json;
 }
 
+nlohmann::json ToJson(const NamedReference& ref) { return ref.name(); }
+
+Result> NamedReferenceFromJson(
+    const nlohmann::json& json) {
+  // Unwrap {"type": "reference", "term": <name>}; the inner term then goes through
+  // the same string check as a bare name instead of an unchecked conversion.
+  const nlohmann::json& name =
+      IsTypedObject(json, kTypeReference) && json.contains(kTerm) ? json[kTerm] : json;
+  if (!name.is_string()) [[unlikely]] {
+    return JsonParseError("Expected string for named reference");
+  }
+  return NamedReference::Make(name.get<std::string>());
+}
+
+nlohmann::json ToJson(const UnboundTransform& transform) {
+  // NOTE(review): the const_cast exists only so reference() can be called here;
+  // presumably that accessor is non-const — confirm and make it const instead.
+  auto& mutable_transform = const_cast(transform);
+  nlohmann::json json;
+  json[kType] = kTransform;
+  json[kTransform] = transform.transform()->ToString();
+  json[kTerm] = mutable_transform.reference()->name();
+  return json;
+}
+
+Result> UnboundTransformFromJson(
+    const nlohmann::json& json) {
+  if (IsTransformTerm(json)) {
+    ICEBERG_ASSIGN_OR_RAISE(auto transform_str,
+                            GetJsonValue(json, kTransform));
+    ICEBERG_ASSIGN_OR_RAISE(auto transform, TransformFromString(transform_str));
+    ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReferenceFromJson(json[kTerm]));
+    return UnboundTransform::Make(std::move(ref), std::move(transform));
+  }
+  return JsonParseError("Invalid unbound transform json: {}", SafeDumpJson(json));
+}
+
+nlohmann::json ToJson(const Literal& literal) {
+  if (literal.IsNull()) {
+    return nullptr;
+  }
+
+  const auto type_id = literal.type()->type_id();
+  const auto& value = literal.value();
+
+  switch (type_id) {
+    case TypeId::kBoolean:
+      return std::get(value);
+    case TypeId::kInt:
+      return std::get(value);
+    case TypeId::kDate:
+      return TransformUtil::HumanDay(std::get(value));
+    case TypeId::kLong:
+      return std::get(value);
+    case TypeId::kTime:
+      return TransformUtil::HumanTime(std::get(value));
+    case TypeId::kTimestamp:
+      return TransformUtil::HumanTimestamp(std::get(value));
+    case TypeId::kTimestampTz:
+      return TransformUtil::HumanTimestampWithZone(std::get(value));
+    case TypeId::kFloat:
+      return std::get(value);
+    case TypeId::kDouble:
+      return std::get(value);
+    case TypeId::kString:
+      return std::get(value);
+    case TypeId::kBinary:
+    case TypeId::kFixed: {
+      const auto& bytes = std::get>(value);
+      std::string hex;
+      hex.reserve(bytes.size() * 2);
+      for (uint8_t byte : bytes) {
+        hex += std::format("{:02X}", byte);
+      }
+      return hex;
+    }
+    case TypeId::kDecimal: {
+      return literal.ToString();
+    }
+    case TypeId::kUuid:
+      return std::get(value).ToString();
+    default:
+      nlohmann::json json;
+      return json;
+  }
+}
+
+Result LiteralFromJson(const nlohmann::json& json) {
+  // Unwrap {"type": "literal", "value": } wrapper
+  if (IsTypedObject(json, kTypeLiteral) && json.contains(kValue)) {
+    return LiteralFromJson(json[kValue]);
+  }
+  if (json.is_null()) {
+    return Literal::Null(nullptr);
+  }
+  if (json.is_boolean()) {
+    return Literal::Boolean(json.get());
+  }
+  if (json.is_number_integer()) {
+    return Literal::Long(json.get());
+  }
+  if (json.is_number_float()) {
+    return Literal::Double(json.get());
+  }
+  if (json.is_string()) {
+    // All strings are returned as String literals.
+    // Conversion to binary/date/time/etc. happens during binding
+    // when schema type information is available.
+    return Literal::String(json.get());
+  }
+  return JsonParseError("Unsupported literal JSON type");
+}
+
+nlohmann::json TermToJson(const Term& term) {
+  switch (term.kind()) {
+    case Term::Kind::kReference:
+      return ToJson(static_cast(term));
+    case Term::Kind::kTransform:
+      return ToJson(static_cast(term));
+    default:
+      return nullptr;
+  }
+}
+
+nlohmann::json ToJson(const UnboundPredicate& pred) {
+  nlohmann::json json;
+  json[kType] = ToJson(pred.op());
+
+  // Get term and literals by casting to the appropriate impl type
+  std::span literals;
+
+  if (auto* ref_pred = dynamic_cast*>(&pred)) {
+    json[kTerm] = TermToJson(*ref_pred->term());
+    literals = ref_pred->literals();
+  } else if (auto* transform_pred =
+                 dynamic_cast*>(&pred)) {
+    json[kTerm] = TermToJson(*transform_pred->term());
+    literals = transform_pred->literals();
+  }
+
+  if (!IsUnaryOperation(pred.op())) {
+    if (IsSetOperation(pred.op())) {
+      nlohmann::json values = nlohmann::json::array();
+      for (const auto& lit : literals) {
+        values.push_back(ToJson(lit));
+      }
+      json[kValues] = std::move(values);
+    } else if (!literals.empty()) {
+      json[kValue] = ToJson(literals[0]);
+    }
+  }
+  return json;
+}
+
+Result> UnboundPredicateFromJson(
+    const nlohmann::json& json) {
+  // operator[] on a const json with a missing key is undefined behavior, so verify
+  // the required fields before indexing.
+  if (!json.is_object() || !json.contains(kType) || !json.contains(kTerm)) [[unlikely]] {
+    return JsonParseError("Predicate missing 'type' or 'term' field: {}",
+                          SafeDumpJson(json));
+  }
+  ICEBERG_ASSIGN_OR_RAISE(auto op, OperationTypeFromJson(json[kType]));
+
+  const auto& term_json = json[kTerm];
+
+  if (IsTransformTerm(term_json)) {
+    ICEBERG_ASSIGN_OR_RAISE(auto term, UnboundTransformFromJson(term_json));
+    return MakePredicateFromJson(op, std::move(term), json);
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto term, NamedReferenceFromJson(term_json));
+  return MakePredicateFromJson(op, std::move(term), json);
+}
+
 Result> ExpressionFromJson(const nlohmann::json& json) {
-  // Handle boolean
+  // Handle boolean constants
   if (json.is_boolean()) {
     return json.get()
                ? internal::checked_pointer_cast(True::Instance())
                : internal::checked_pointer_cast(False::Instance());
   }
-  return JsonParseError("Only booleans are currently supported.");
+
+  if (!json.is_object()) [[unlikely]] {
+    return JsonParseError("Expression must be boolean or object");
+  }
+  // const operator[] requires the key to exist; reject a missing "type" explicitly.
+  if (!json.contains(kType)) [[unlikely]] {
+    return JsonParseError("Expression missing 'type' field: {}", SafeDumpJson(json));
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto op, OperationTypeFromJson(json[kType]));
+
+  switch (op) {
+    case Expression::Operation::kAnd: {
+      if (!json.contains(kLeft) || !json.contains(kRight)) [[unlikely]] {
+        return JsonParseError("AND expression missing 'left' or 'right' field");
+      }
+      ICEBERG_ASSIGN_OR_RAISE(auto left, ExpressionFromJson(json[kLeft]));
+      ICEBERG_ASSIGN_OR_RAISE(auto right, ExpressionFromJson(json[kRight]));
+      ICEBERG_ASSIGN_OR_RAISE(auto result, And::Make(std::move(left), std::move(right)));
+      return std::shared_ptr(std::move(result));
+    }
+    case Expression::Operation::kOr: {
+      if (!json.contains(kLeft) || !json.contains(kRight)) [[unlikely]] {
+        return JsonParseError("OR expression missing 'left' or 'right' field");
+      }
+      ICEBERG_ASSIGN_OR_RAISE(auto left, ExpressionFromJson(json[kLeft]));
+      ICEBERG_ASSIGN_OR_RAISE(auto right, ExpressionFromJson(json[kRight]));
+      ICEBERG_ASSIGN_OR_RAISE(auto result, Or::Make(std::move(left), std::move(right)));
+      return std::shared_ptr(std::move(result));
+    }
+    case Expression::Operation::kNot: {
+      if (!json.contains(kChild)) [[unlikely]] {
+        return JsonParseError("NOT expression missing 'child' field");
+      }
+      ICEBERG_ASSIGN_OR_RAISE(auto child, ExpressionFromJson(json[kChild]));
+      ICEBERG_ASSIGN_OR_RAISE(auto result, Not::Make(std::move(child)));
+      return std::shared_ptr(std::move(result));
+    }
+    default:
+      // All other operations are predicates
+      return UnboundPredicateFromJson(json);
+  }
 }
 
 nlohmann::json ToJson(const Expression& expr) {
   switch (expr.op()) {
     case Expression::Operation::kTrue:
       return true;
-
     case Expression::Operation::kFalse:
       return false;
+    case Expression::Operation::kAnd: {
+      const auto& and_expr = static_cast(expr);
+      nlohmann::json json;
+      json[kType] = ToJson(expr.op());
+      json[kLeft] = ToJson(*and_expr.left());
+      json[kRight] = ToJson(*and_expr.right());
+      return json;
+    }
+    case Expression::Operation::kOr: {
+      const auto& or_expr = static_cast(expr);
+      nlohmann::json json;
+      json[kType] = ToJson(expr.op());
+      json[kLeft] = ToJson(*or_expr.left());
+      json[kRight] = ToJson(*or_expr.right());
+      return json;
+    }
+    case Expression::Operation::kNot: {
+      const auto& not_expr = static_cast(expr);
+      nlohmann::json json;
+      json[kType] = ToJson(expr.op());
+      json[kChild] = ToJson(*not_expr.child());
+      return json;
+    }
     default:
-      // TODO(evindj): This code will be removed as we implemented the full expression
-      // serialization.
-      ICEBERG_CHECK_OR_DIE(false, "Only booleans are currently supported.");
+      return ToJson(dynamic_cast(expr));
   }
 }
 
diff --git a/src/iceberg/expression/json_serde_internal.h b/src/iceberg/expression/json_serde_internal.h
index e44234d39..98233d542 100644
--- a/src/iceberg/expression/json_serde_internal.h
+++ b/src/iceberg/expression/json_serde_internal.h
@@ -19,6 +19,8 @@
 
 #pragma once
 
+#include
+
 #include
 
 #include "iceberg/expression/expression.h"
@@ -57,6 +59,63 @@ ICEBERG_EXPORT Result> ExpressionFromJson(
 /// \return A JSON object representing the expression
 ICEBERG_EXPORT nlohmann::json ToJson(const Expression& expr);
 
+/// \brief Deserializes a JSON object into a NamedReference.
+///
+/// \param json A JSON object representing a named reference
+/// \return A shared pointer to the deserialized NamedReference or an error
+ICEBERG_EXPORT Result> NamedReferenceFromJson(
+    const nlohmann::json& json);
+
+/// \brief Serializes a NamedReference into its JSON representation.
+///
+/// \param ref The named reference to serialize
+/// \return A JSON object representing the named reference
+ICEBERG_EXPORT nlohmann::json ToJson(const NamedReference& ref);
+
+/// \brief Serializes an UnboundTransform into its JSON representation.
+/// +/// \param transform The unbound transform to serialize +/// \return A JSON object representing the unbound transform +ICEBERG_EXPORT nlohmann::json ToJson(const UnboundTransform& transform); + +/// \brief Deserializes a JSON object into an UnboundTransform. +/// +/// \param json A JSON object representing an unbound transform +/// \return A shared pointer to the deserialized UnboundTransform or an error +ICEBERG_EXPORT Result> UnboundTransformFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Literal into its JSON representation. +/// +/// \param literal The literal to serialize +/// \return A JSON value representing the literal +ICEBERG_EXPORT nlohmann::json ToJson(const Literal& literal); + +/// \brief Deserializes a JSON value into a Literal. +/// +/// \param json A JSON value representing a literal +/// \return The deserialized Literal or an error +ICEBERG_EXPORT Result LiteralFromJson(const nlohmann::json& json); + +/// \brief Serializes an UnboundPredicate into its JSON representation. +/// +/// \param pred The unbound predicate to serialize +/// \return A JSON object representing the predicate +ICEBERG_EXPORT nlohmann::json ToJson(const UnboundPredicate& pred); + +/// \brief Deserializes a JSON object into an UnboundPredicate. +/// +/// \param json A JSON object representing an unbound predicate +/// \return A shared pointer to the deserialized UnboundPredicate or an error +ICEBERG_EXPORT Result> UnboundPredicateFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Term into its JSON representation. 
+///
+/// \param term The term to serialize (NamedReference or UnboundTransform)
+/// \return A JSON value representing the term
+ICEBERG_EXPORT nlohmann::json TermToJson(const Term& term);
+
 /// Check if an operation is a unary predicate
 ICEBERG_EXPORT bool IsUnaryOperation(Expression::Operation op);
 
diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc
index 88bafd78d..22617d32a 100644
--- a/src/iceberg/expression/literal.cc
+++ b/src/iceberg/expression/literal.cc
@@ -23,14 +23,46 @@
 #include
 #include
 #include
+#include
 
+#include "iceberg/type.h"
 #include "iceberg/util/checked_cast.h"
 #include "iceberg/util/conversions.h"
+#include "iceberg/util/decimal.h"
 #include "iceberg/util/macros.h"
 #include "iceberg/util/temporal_util.h"
+#include "iceberg/util/transform_util.h"
 
 namespace iceberg {
 
+namespace {
+Result> HexStringToBytes(std::string_view hex) {
+  if (hex.length() % 2 != 0) {
+    return InvalidArgument("Hex string must have an even length");
+  }
+
+  // Digit value, or -1 for a non-hex character (a sentinel instead of throw/catch).
+  auto to_nibble = [](char c) -> int {
+    if (c >= '0' && c <= '9') return c - '0';
+    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+    return -1;
+  };
+
+  std::vector bytes;
+  bytes.reserve(hex.length() / 2);
+  for (size_t i = 0; i < hex.length(); i += 2) {
+    const int hi = to_nibble(hex[i]);
+    const int lo = to_nibble(hex[i + 1]);
+    if (hi < 0 || lo < 0) {
+      return InvalidArgument("Invalid hex character in string: {}", hex);
+    }
+    bytes.push_back(static_cast<uint8_t>((hi << 4) | lo));
+  }
+  return bytes;
+}
+}  // namespace
+
 /// \brief LiteralCaster handles type casting operations for Literal.
 /// This is an internal implementation class.
class LiteralCaster { @@ -193,12 +225,36 @@ Result LiteralCaster::CastFromString( ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(str_val)); return Literal::UUID(uuid); } - case TypeId::kDate: - case TypeId::kTime: - case TypeId::kTimestamp: - case TypeId::kTimestampTz: - return NotImplemented("Cast from String to {} is not implemented yet", - target_type->ToString()); + case TypeId::kDate: { + ICEBERG_ASSIGN_OR_RAISE(auto days, TransformUtil::ParseDay(str_val)); + return Literal::Date(days); + } + case TypeId::kTime: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTime(str_val)); + return Literal::Time(micros); + } + case TypeId::kTimestamp: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTimestamp(str_val)); + return Literal::Timestamp(micros); + } + case TypeId::kTimestampTz: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, + TransformUtil::ParseTimestampWithZone(str_val)); + return Literal::TimestampTz(micros); + } + case TypeId::kBinary: { + ICEBERG_ASSIGN_OR_RAISE(auto bytes, HexStringToBytes(str_val)); + return Literal::Binary(std::move(bytes)); + } + case TypeId::kFixed: { + ICEBERG_ASSIGN_OR_RAISE(auto bytes, HexStringToBytes(str_val)); + return Literal::Fixed(std::move(bytes)); + } + case TypeId::kDecimal: { + const auto& dec_type = internal::checked_cast(*target_type); + ICEBERG_ASSIGN_OR_RAISE(auto dec, Decimal::FromString(str_val)); + return Literal::Decimal(dec.value(), dec_type.precision(), dec_type.scale()); + } default: return NotSupported("Cast from String to {} is not supported", target_type->ToString()); diff --git a/src/iceberg/test/expression_json_test.cc b/src/iceberg/test/expression_json_test.cc index dd3ac5e3e..1851946d3 100644 --- a/src/iceberg/test/expression_json_test.cc +++ b/src/iceberg/test/expression_json_test.cc @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -25,42 +26,210 @@ #include #include "iceberg/expression/expression.h" -#include "iceberg/expression/expressions.h" #include 
"iceberg/expression/json_serde_internal.h" #include "iceberg/expression/literal.h" -#include "iceberg/expression/predicate.h" #include "iceberg/expression/term.h" #include "iceberg/test/matchers.h" +#include "iceberg/transform.h" +#include "iceberg/util/uuid.h" namespace iceberg { -// Test boolean constant expressions -TEST(ExpressionJsonTest, CheckBooleanExpression) { - auto checkBoolean = [](std::shared_ptr expr, bool value) { - auto json = ToJson(*expr); - EXPECT_TRUE(json.is_boolean()); - EXPECT_EQ(json.get(), value); - - auto result = ExpressionFromJson(json); - ASSERT_THAT(result, IsOk()); - if (value) { - EXPECT_EQ(result.value()->op(), Expression::Operation::kTrue); - } else { - EXPECT_EQ(result.value()->op(), Expression::Operation::kFalse); - } - }; - checkBoolean(True::Instance(), true); - checkBoolean(False::Instance(), false); +struct ExpressionJsonRoundTripParam { + std::string name; + nlohmann::json json; + Expression::Operation expected_op; +}; + +class ExpressionJsonRoundTripTest + : public ::testing::TestWithParam {}; + +TEST_P(ExpressionJsonRoundTripTest, RoundTrip) { + const auto& param = GetParam(); + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(param.json)); + EXPECT_EQ(expr->op(), param.expected_op); + auto round_trip = ToJson(*expr); + EXPECT_EQ(round_trip, param.json); } -TEST(ExpressionJsonTest, OperationTypeTests) { - EXPECT_EQ(OperationTypeFromJson("true"), Expression::Operation::kTrue); - EXPECT_EQ("true", ToJson(Expression::Operation::kTrue)); - EXPECT_TRUE(IsSetOperation(Expression::Operation::kIn)); - EXPECT_FALSE(IsSetOperation(Expression::Operation::kTrue)); +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, ExpressionJsonRoundTripTest, + ::testing::Values( + ExpressionJsonRoundTripParam{"BooleanTrue", true, Expression::Operation::kTrue}, + ExpressionJsonRoundTripParam{"BooleanFalse", false, + Expression::Operation::kFalse}, + ExpressionJsonRoundTripParam{"UnaryIsNull", + {{"type", "is-null"}, {"term", "col"}}, + 
Expression::Operation::kIsNull}, + ExpressionJsonRoundTripParam{"LiteralGt", + {{"type", "gt"}, {"term", "age"}, {"value", 21}}, + Expression::Operation::kGt}, + ExpressionJsonRoundTripParam{ + "SetIn", + {{"type", "in"}, + {"term", "status"}, + {"values", nlohmann::json::array({"active", "pending"})}}, + Expression::Operation::kIn}, + ExpressionJsonRoundTripParam{ + "AndExpression", + {{"type", "and"}, + {"left", {{"type", "gt"}, {"term", "age"}, {"value", 18}}}, + {"right", {{"type", "lt"}, {"term", "age"}, {"value", 65}}}}, + Expression::Operation::kAnd}, + ExpressionJsonRoundTripParam{ + "NotExpression", + {{"type", "not"}, {"child", {{"type", "is-null"}, {"term", "name"}}}}, + Expression::Operation::kNot}, + ExpressionJsonRoundTripParam{ + "TransformDay", + {{"type", "eq"}, + {"term", {{"type", "transform"}, {"transform", "day"}, {"term", "ts"}}}, + {"value", 19738}}, + Expression::Operation::kEq}, + ExpressionJsonRoundTripParam{ + "TransformYear", + {{"type", "gt"}, + {"term", + {{"type", "transform"}, {"transform", "year"}, {"term", "timestamp_col"}}}, + {"value", 2020}}, + Expression::Operation::kGt}, + ExpressionJsonRoundTripParam{ + "TransformTruncate", + {{"type", "lt"}, + {"term", + {{"type", "transform"}, {"transform", "truncate[4]"}, {"term", "col"}}}, + {"value", 100}}, + Expression::Operation::kLt}, + ExpressionJsonRoundTripParam{ + "LiteralNotEq", + {{"type", "not-eq"}, {"term", "status"}, {"value", "closed"}}, + Expression::Operation::kNotEq}, + ExpressionJsonRoundTripParam{ + "LiteralLtEq", + {{"type", "lt-eq"}, {"term", "price"}, {"value", 100}}, + Expression::Operation::kLtEq}, + ExpressionJsonRoundTripParam{ + "LiteralGtEq", + {{"type", "gt-eq"}, {"term", "quantity"}, {"value", 1}}, + Expression::Operation::kGtEq}, + ExpressionJsonRoundTripParam{ + "SetNotIn", + {{"type", "not-in"}, + {"term", "category"}, + {"values", nlohmann::json::array({"archived", "deleted"})}}, + Expression::Operation::kNotIn}, + 
ExpressionJsonRoundTripParam{"UnaryNotNan", + {{"type", "not-nan"}, {"term", "score"}}, + Expression::Operation::kNotNan}, + ExpressionJsonRoundTripParam{ + "LiteralStartsWith", + {{"type", "starts-with"}, {"term", "name"}, {"value", "prefix"}}, + Expression::Operation::kStartsWith}, + ExpressionJsonRoundTripParam{ + "LiteralNotStartsWith", + {{"type", "not-starts-with"}, {"term", "name"}, {"value", "bad"}}, + Expression::Operation::kNotStartsWith}, + ExpressionJsonRoundTripParam{ + "OrExpression", + {{"type", "or"}, + {"left", {{"type", "lt"}, {"term", "price"}, {"value", 50}}}, + {"right", {{"type", "not-null"}, {"term", "discount"}}}}, + Expression::Operation::kOr}, + ExpressionJsonRoundTripParam{ + "NestedWithDecimals", + {{"type", "or"}, + {"left", + {{"type", "and"}, + {"left", + {{"type", "in"}, + {"term", "price"}, + {"values", nlohmann::json::array({3.14, 2.72})}}}, + {"right", {{"type", "eq"}, {"term", "currency"}, {"value", "USD"}}}}}, + {"right", {{"type", "is-nan"}, {"term", "discount"}}}}, + Expression::Operation::kOr}, + ExpressionJsonRoundTripParam{ + "FixedBinaryInPredicate", + {{"type", "eq"}, {"term", "col"}, {"value", "010203"}}, + Expression::Operation::kEq}, + ExpressionJsonRoundTripParam{"ScaleDecimalInSet", + {{"type", "in"}, + {"term", "amount"}, + {"values", nlohmann::json::array({"3.14E+4"})}}, + Expression::Operation::kIn}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +// -- Object wrapper normalization tests -- - EXPECT_TRUE(IsUnaryOperation(Expression::Operation::kIsNull)); - EXPECT_FALSE(IsUnaryOperation(Expression::Operation::kTrue)); +TEST(ExpressionJsonTest, PredicateWithObjectLiteral) { + nlohmann::json input = {{"type", "lt-eq"}, + {"term", "col"}, + {"value", {{"type", "literal"}, {"value", 50}}}}; + nlohmann::json expected = {{"type", "lt-eq"}, {"term", "col"}, {"value", 50}}; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(input)); + EXPECT_EQ(ToJson(*expr), expected); } 
+TEST(ExpressionJsonTest, PredicateWithObjectReference) { + nlohmann::json input = {{"type", "lt-eq"}, + {"term", {{"type", "reference"}, {"term", "col"}}}, + {"value", 50}}; + nlohmann::json expected = {{"type", "lt-eq"}, {"term", "col"}, {"value", 50}}; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(input)); + EXPECT_EQ(ToJson(*expr), expected); +} + +// -- Parameterized invalid expression tests -- + +struct InvalidExpressionParam { + std::string name; + nlohmann::json json; + std::string expected_error_substr; +}; + +class InvalidExpressionTest : public ::testing::TestWithParam {}; + +TEST_P(InvalidExpressionTest, ReturnsError) { + const auto& param = GetParam(); + auto result = ExpressionFromJson(param.json); + EXPECT_THAT(result, HasErrorMessage(param.expected_error_substr)); +} + +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, InvalidExpressionTest, + ::testing::Values( + InvalidExpressionParam{"NotBooleanOrObject", 42, "boolean or object"}, + InvalidExpressionParam{"UnknownOperationType", + {{"type", "illegal"}, {"term", "col"}}, + "Unknown expression type"}, + InvalidExpressionParam{ + "AndMissingLeft", + {{"type", "and"}, {"right", {{"type", "is-null"}, {"term", "col"}}}}, + "missing 'left' or 'right'"}, + InvalidExpressionParam{ + "OrMissingRight", + {{"type", "or"}, {"left", {{"type", "is-null"}, {"term", "col"}}}}, + "missing 'left' or 'right'"}, + InvalidExpressionParam{"NotMissingChild", {{"type", "not"}}, "missing 'child'"}, + InvalidExpressionParam{"UnaryWithSpuriousValue", + {{"type", "not-nan"}, {"term", "col"}, {"value", 42}}, + "invalid 'value' field"}, + InvalidExpressionParam{"UnaryWithSpuriousValues", + {{"type", "is-nan"}, + {"term", "col"}, + {"values", nlohmann::json::array({1, 2})}}, + "invalid 'values' field"}, + InvalidExpressionParam{"NumericTerm", + {{"type", "lt"}, {"term", 23}, {"value", 10}}, + "Expected string for named reference"}, + InvalidExpressionParam{"SetMissingValues", + {{"type", "in"}, {"term", "col"}, {"value", 
42}}, + "values"}, + InvalidExpressionParam{ + "LiteralMissingValue", {{"type", "gt"}, {"term", "col"}}, "value"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + } // namespace iceberg diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc index 01a7a7ce6..97724aad9 100644 --- a/src/iceberg/test/literal_test.cc +++ b/src/iceberg/test/literal_test.cc @@ -787,6 +787,37 @@ INSTANTIATE_TEST_SUITE_P( .target_type = uuid(), .expected_literal = Literal::UUID( Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value())}, + CastLiteralTestParam{.test_name = "StringToDate", + .source_literal = Literal::String("2024-01-16"), + .target_type = date(), + .expected_literal = Literal::Date(19738)}, + CastLiteralTestParam{.test_name = "StringToTime", + .source_literal = Literal::String("14:30"), + .target_type = time(), + .expected_literal = Literal::Time(52200000000LL)}, + CastLiteralTestParam{.test_name = "StringToTimestamp", + .source_literal = Literal::String("2026-01-01T00:00:01.500"), + .target_type = timestamp(), + .expected_literal = Literal::Timestamp(1767225601500000L)}, + CastLiteralTestParam{ + .test_name = "StringToTimestampTz", + .source_literal = Literal::String("2026-01-01T00:00:01.500+00:00"), + .target_type = timestamp_tz(), + .expected_literal = Literal::TimestampTz(1767225601500000L)}, + CastLiteralTestParam{.test_name = "StringToBinary", + .source_literal = Literal::String("010203FF"), + .target_type = binary(), + .expected_literal = Literal::Binary(std::vector{ + 0x01, 0x02, 0x03, 0xFF})}, + CastLiteralTestParam{.test_name = "StringToFixed", + .source_literal = Literal::String("01020304"), + .target_type = fixed(4), + .expected_literal = Literal::Fixed(std::vector{ + 0x01, 0x02, 0x03, 0x04})}, + CastLiteralTestParam{.test_name = "StringToDecimal", + .source_literal = Literal::String("1234.56"), + .target_type = decimal(6, 2), + .expected_literal = Literal::Decimal(123456, 6, 2)}, // Same type 
cast test CastLiteralTestParam{.test_name = "IntToInt", .source_literal = Literal::Int(42), diff --git a/src/iceberg/test/transform_util_test.cc b/src/iceberg/test/transform_util_test.cc index 76f6824b3..48a455973 100644 --- a/src/iceberg/test/transform_util_test.cc +++ b/src/iceberg/test/transform_util_test.cc @@ -21,6 +21,8 @@ #include +#include "iceberg/test/matchers.h" + namespace iceberg { TEST(TransformUtilTest, HumanYear) { @@ -157,4 +159,97 @@ TEST(TransformUtilTest, Base64Encode) { EXPECT_EQ("AA==", TransformUtil::Base64Encode({"\x00", 1})); } +struct ParseRoundTripParam { + std::string name; + std::string str; + int64_t value; + enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind; +}; + +class ParseRoundTripTest : public ::testing::TestWithParam {}; + +TEST_P(ParseRoundTripTest, RoundTrip) { + const auto& param = GetParam(); + switch (param.kind) { + case ParseRoundTripParam::kDay: { + EXPECT_EQ(TransformUtil::HumanDay(static_cast(param.value)), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseDay(param.str)); + EXPECT_EQ(parsed, static_cast(param.value)); + break; + } + case ParseRoundTripParam::kTime: { + EXPECT_EQ(TransformUtil::HumanTime(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTime(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseRoundTripParam::kTimestamp: { + EXPECT_EQ(TransformUtil::HumanTimestamp(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTimestamp(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseRoundTripParam::kTimestampTz: { + EXPECT_EQ(TransformUtil::HumanTimestampWithZone(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, + TransformUtil::ParseTimestampWithZone(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + } +} + +INSTANTIATE_TEST_SUITE_P( + TransformUtilTest, ParseRoundTripTest, + ::testing::Values( + // Day round-trips + ParseRoundTripParam{"DayEpoch", 
"1970-01-01", 0, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayNext", "1970-01-02", 1, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayBeforeEpoch", "1969-12-31", -1, + ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayYear999", "0999-12-31", -354286, + ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayNonLeap", "1971-01-01", 365, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayY2K", "2000-01-01", 10957, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"Day2026", "2026-01-01", 20454, ParseRoundTripParam::kDay}, + // Time round-trips + ParseRoundTripParam{"TimeMidnight", "00:00", 0, ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeOneSec", "00:00:01", 1000000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeMillis", "00:00:01.500", 1500000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeOneMillis", "00:00:01.001", 1001000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeMicros", "00:00:01.000001", 1000001, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeHourMinSec", "01:02:03", 3723000000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeEndOfDay", "23:59:59", 86399000000, + ParseRoundTripParam::kTime}, + // Timestamp round-trips + ParseRoundTripParam{"TimestampEpoch", "1970-01-01T00:00:00", 0, + ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampOneSec", "1970-01-01T00:00:01", 1000000, + ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampMillis", "2026-01-01T00:00:01.500", + 1767225601500000L, ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampOneMillis", "2026-01-01T00:00:01.001", + 1767225601001000L, ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampMicros", "2026-01-01T00:00:01.000001", + 1767225601000001L, ParseRoundTripParam::kTimestamp}, + // TimestampTz round-trips + ParseRoundTripParam{"TimestampTzEpoch", "1970-01-01T00:00:00+00:00", 0, + ParseRoundTripParam::kTimestampTz}, + 
ParseRoundTripParam{"TimestampTzOneSec", "1970-01-01T00:00:01+00:00", 1000000, + ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzMillis", "2026-01-01T00:00:01.500+00:00", + 1767225601500000L, ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzOneMillis", "2026-01-01T00:00:01.001+00:00", + 1767225601001000L, ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzMicros", "2026-01-01T00:00:01.000001+00:00", + 1767225601000001L, ParseRoundTripParam::kTimestampTz}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + } // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index e97de0ac5..491775ee3 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -129,8 +129,11 @@ class BoundReference; class BoundTransform; class Expression; class Literal; +class NamedReference; class Term; +class Transform; class UnboundPredicate; +class UnboundTransform; /// \brief Evaluator. class Evaluator; diff --git a/src/iceberg/util/transform_util.cc b/src/iceberg/util/transform_util.cc index fe1523437..42892835f 100644 --- a/src/iceberg/util/transform_util.cc +++ b/src/iceberg/util/transform_util.cc @@ -20,14 +20,33 @@ #include "iceberg/util/transform_util.h" #include +#include #include +#include "iceberg/util/macros.h" + namespace iceberg { namespace { constexpr auto kEpochDate = std::chrono::year{1970} / std::chrono::January / 1; constexpr int64_t kMicrosPerMillis = 1'000; constexpr int64_t kMicrosPerSecond = 1'000'000; +constexpr int64_t kMicrosPerDay = 86'400'000'000LL; + +/// Parse fractional seconds (after '.') and return micros. +/// Accepts 1-6 digits, zero-padded on the right to 6 digits. 
+Result<int64_t> ParseFractionalMicros(std::string_view frac) {
+  int32_t val = 0;
+  const char* const end = frac.data() + frac.size();
+  auto [ptr, ec] = std::from_chars(frac.data(), end, val);
+  // std::from_chars accepts a leading '-' and stops at the first non-digit: reject both.
+  if (ec != std::errc{} || ptr != end || frac.size() > 6 || frac.front() == '-') {
+    return InvalidArgument("Invalid fractional seconds: '{}'", frac);
+  }
+  // Right-pad to 6 digits: "500" → 500000, "001" → 1000, "000001" → 1
+  for (size_t i = frac.size(); i < 6; ++i) val *= 10;
+  return static_cast<int64_t>(val);
+}
 }  // namespace
 
 std::string TransformUtil::HumanYear(int32_t year_ordinal) {
@@ -92,6 +111,132 @@ std::string TransformUtil::HumanTimestampWithZone(int64_t timestamp_micros) {
   }
 }
 
+namespace {
+
+/// Parses all of `field` as an unsigned base-10 integer and stores it in `out`.
+///
+/// Returns false for an empty field, a leading sign, any non-digit character, or a
+/// value that does not fit in int32_t. A bare std::from_chars call would accept a
+/// leading '-' and silently stop at the first non-digit character.
+bool ParseDigits(std::string_view field, int32_t& out) {
+  if (field.empty() || field.front() == '-') {
+    return false;
+  }
+  const char* const end = field.data() + field.size();
+  auto [ptr, ec] = std::from_chars(field.data(), end, out);
+  return ec == std::errc{} && ptr == end;
+}
+
+}  // namespace
+
+/// Parses "[-]yyyy-MM-dd" into the number of days since 1970-01-01.
+///
+/// Returns InvalidArgument when the string is shorter than 10 characters, a separator
+/// is missing, a field is not purely numeric, or the date does not exist.
+Result<int32_t> TransformUtil::ParseDay(std::string_view str) {
+  // Check the length before touching str[0]; indexing an empty view is undefined.
+  if (str.size() < 10) [[unlikely]] {
+    return InvalidArgument("Invalid date string: '{}'", str);
+  }
+  // A leading '-' is the sign of the year, not a field separator.
+  const bool negative_year = str.front() == '-';
+  const size_t year_begin = negative_year ? 1 : 0;
+  const auto dash1 = str.find('-', year_begin);
+  const auto dash2 =
+      dash1 == std::string_view::npos ? dash1 : str.find('-', dash1 + 1);
+  if (dash2 == std::string_view::npos) [[unlikely]] {
+    return InvalidArgument("Invalid date string: '{}'", str);
+  }
+
+  int32_t year = 0, month = 0, day = 0;
+  if (!ParseDigits(str.substr(year_begin, dash1 - year_begin), year) ||
+      !ParseDigits(str.substr(dash1 + 1, dash2 - dash1 - 1), month) ||
+      !ParseDigits(str.substr(dash2 + 1), day)) [[unlikely]] {
+    return InvalidArgument("Invalid date string: '{}'", str);
+  }
+  if (negative_year) {
+    year = -year;
+  }
+
+  // std::chrono::year/month/day hold out-of-range values in an unspecified way, so
+  // range-check first; ymd.ok() below then validates the day against the month.
+  if (year < static_cast<int>(std::chrono::year::min()) ||
+      year > static_cast<int>(std::chrono::year::max()) || month < 1 || month > 12 ||
+      day < 1 || day > 31) [[unlikely]] {
+    return InvalidArgument("Invalid date: '{}'", str);
+  }
+  const auto ymd = std::chrono::year{year} /
+                   std::chrono::month{static_cast<unsigned>(month)} /
+                   std::chrono::day{static_cast<unsigned>(day)};
+  if (!ymd.ok()) [[unlikely]] {
+    return InvalidArgument("Invalid date: '{}'", str);
+  }
+
+  const auto days = std::chrono::sys_days(ymd) - std::chrono::sys_days(kEpochDate);
+  return static_cast<int32_t>(days.count());
+}
+
+/// Parses "HH:mm", "HH:mm:ss" or "HH:mm:ss.f" (1-6 fractional digits) into
+/// microseconds from midnight.
+Result<int64_t> TransformUtil::ParseTime(std::string_view str) {
+  // Each length and separator is verified before the field behind it is read, so a
+  // short or malformed string can never cause an out-of-bounds access.
+  int32_t hours = 0, minutes = 0, seconds = 0;
+  if (str.size() < 5 || str[2] != ':' || !ParseDigits(str.substr(0, 2), hours) ||
+      !ParseDigits(str.substr(3, 2), minutes)) [[unlikely]] {
+    return InvalidArgument("Invalid time string: '{}'", str);
+  }
+
+  int64_t frac_micros = 0;
+  if (str.size() > 5) {
+    if (str.size() < 8 || str[5] != ':' ||
+        !ParseDigits(str.substr(6, 2), seconds)) [[unlikely]] {
+      return InvalidArgument("Invalid time string: '{}'", str);
+    }
+    if (str.size() > 8) {
+      if (str[8] != '.') [[unlikely]] {
+        return InvalidArgument("Invalid time string: '{}'", str);
+      }
+      ICEBERG_ASSIGN_OR_RAISE(frac_micros, ParseFractionalMicros(str.substr(9)));
+    }
+  }
+
+  // Two digits alone allow values up to 99; reject anything outside a real clock time.
+  if (hours > 23 || minutes > 59 || seconds > 59) [[unlikely]] {
+    return InvalidArgument("Invalid time string: '{}'", str);
+  }
+
+  const int64_t whole_seconds = int64_t{hours} * 3'600 + minutes * 60 + seconds;
+  return whole_seconds * kMicrosPerSecond + frac_micros;
+}
+
+/// Parses "<date>T<time>" into microseconds since epoch, delegating the two halves to
+/// ParseDay and ParseTime.
+Result<int64_t> TransformUtil::ParseTimestamp(std::string_view str) {
+  // Format: "yyyy-MM-ddTHH:mm[:ss[.f]]"
+  const auto t_pos = str.find('T');
+  if (t_pos == std::string_view::npos) [[unlikely]] {
+    return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", str);
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto days, ParseDay(str.substr(0, t_pos)));
+  ICEBERG_ASSIGN_OR_RAISE(auto time_micros, ParseTime(str.substr(t_pos + 1)));
+
+  return static_cast<int64_t>(days) * kMicrosPerDay + time_micros;
+}
+
+/// Parses a timestamp that must end in the UTC offset "+00:00"; the remainder is
+/// handed to ParseTimestamp.
+Result<int64_t> TransformUtil::ParseTimestampWithZone(std::string_view str) {
+  constexpr std::string_view kZoneSuffix = "+00:00";
+  if (!str.ends_with(kZoneSuffix)) [[unlikely]] {
+    return InvalidArgument("Invalid timestamptz string (missing '+00:00' suffix): '{}'",
+                           str);
+  }
+  str.remove_suffix(kZoneSuffix.size());
+  return ParseTimestamp(str);
+}
+
 std::string TransformUtil::Base64Encode(std::string_view str_to_encode) {
   static constexpr std::string_view kBase64Chars =
       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
diff --git
a/src/iceberg/util/transform_util.h b/src/iceberg/util/transform_util.h
index 7482b0dba..2c2a78bbf 100644
--- a/src/iceberg/util/transform_util.h
+++ b/src/iceberg/util/transform_util.h
@@ -22,6 +22,7 @@
 #include
 
 #include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
 
 namespace iceberg {
 
@@ -98,6 +99,36 @@ class ICEBERG_EXPORT TransformUtil {
   /// \return a string representation of this timestamp.
   static std::string HumanTimestampWithZone(int64_t timestamp_micros);
 
+  /// \brief Parses a date string in "yyyy-MM-dd" format into days since epoch.
+  ///
+  /// \param str The date string to parse; a leading '-' denotes a negative year.
+  /// \return The number of days since 1970-01-01, or an error.
+  static Result<int32_t> ParseDay(std::string_view str);
+
+  /// \brief Parses a time string into microseconds from midnight.
+  ///
+  /// Accepts: "HH:mm", "HH:mm:ss", or "HH:mm:ss.f" with 1 to 6 fractional digits.
+  ///
+  /// \param str The time string to parse.
+  /// \return The number of microseconds from midnight, or an error.
+  static Result<int64_t> ParseTime(std::string_view str);
+
+  /// \brief Parses a timestamp string into microseconds since epoch.
+  ///
+  /// Accepts: a ParseDay date, then 'T', then any time string accepted by ParseTime.
+  ///
+  /// \param str The timestamp string to parse.
+  /// \return The number of microseconds since epoch, or an error.
+  static Result<int64_t> ParseTimestamp(std::string_view str);
+
+  /// \brief Parses a timestamp-with-zone string into microseconds since epoch.
+  ///
+  /// Accepts the same formats as ParseTimestamp, with a "+00:00" suffix.
+  ///
+  /// \param str The timestamp string to parse.
+  /// \return The number of microseconds since epoch, or an error.
+  static Result<int64_t> ParseTimestampWithZone(std::string_view str);
+
   /// \brief Base64 encode a string
   static std::string Base64Encode(std::string_view str_to_encode);
 };