diff --git a/mariadb-plugin-columnstore.install.generated b/mariadb-plugin-columnstore.install.generated index d987525f2a671..2b1434e58b9a1 100644 --- a/mariadb-plugin-columnstore.install.generated +++ b/mariadb-plugin-columnstore.install.generated @@ -1 +1,3 @@ #File is generated by ColumnstoreLibrary.cmake, do not edit +etc/mysql/columnstore.cnf # added in dbcon/mysql/CMakeLists.txt +usr/local/mysql/lib/plugin/ha_columnstore.so # added in dbcon/mysql/CMakeLists.txt diff --git a/mysql-test/main/mdev_35327.result b/mysql-test/main/mdev_35327.result new file mode 100644 index 0000000000000..3100a51ba08dd --- /dev/null +++ b/mysql-test/main/mdev_35327.result @@ -0,0 +1,70 @@ +# +# MDEV-35327: Add VEC_DISTANCE_MANHATTAN function +# +# +# Checking for argument validity +# +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,2]')); +ERROR 42000: Incorrect parameter count in the call to native function 'VEC_DISTANCE_MANHATTAN' +SELECT VEC_DISTANCE_MANHATTAN(NULL, VEC_FromText('[1,2]')); +VEC_DISTANCE_MANHATTAN(NULL, VEC_FromText('[1,2]')) +NULL +# Checking for mismatched dimensions +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,1,1]'),VEC_FromText('[1,2]')); +VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,1,1]'),VEC_FromText('[1,2]')) +NULL +# +# Basic math check +# +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,2,3]'), VEC_FromText('[2,3,4]')); +VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,2,3]'), VEC_FromText('[2,3,4]')) +3 +# +# Without Vector Index +# +CREATE TABLE t1 (id INT, v VECTOR(3) NOT NULL); +INSERT INTO t1 VALUES (1, VEC_FromText('[2,2,2]')), (2, VEC_FromText('[0,0,5]')), (3, VEC_FromText('[1,1,1]')); +# Manhattan distance:- 6,5,3 Euclidean distance:- 3.46,5,1.73 +# Manhattan | Euclidean +# P3 P3 +# P2 P1 +# P1 P2 +# output should be 3,5,6 and ordering should be P3 < P2 < P1 +SELECT id, VEC_DISTANCE_MANHATTAN(v, VEC_FromText('[0,0,0]')) as dist FROM t1 ORDER BY dist; +id dist +3 3 +2 5 +1 6 +# Comparison with Euclidean distance +SELECT id, VEC_DISTANCE_EUCLIDEAN(v, 
VEC_FromText('[0,0,0]')) as dist FROM t1 ORDER BY dist; +id dist +3 1.7320508075688772 +1 3.4641016151377544 +2 5 +# +# With Vector Index +# +CREATE VECTOR INDEX idx ON t1(v) DISTANCE=manhattan; +# Output should be 3,5 and 6 again +SELECT id, VEC_DISTANCE_MANHATTAN(v, VEC_FromText('[0,0,0]')) as dist FROM t1 ORDER BY dist LIMIT 3; +id dist +3 3 +2 5 +1 6 +# Checking if the vector index is actually implemented using manhattan distance +EXPLAIN SELECT id FROM t1 FORCE INDEX (idx) +ORDER BY VEC_DISTANCE_MANHATTAN(v, VEC_FromText('[0,0,0]')) LIMIT 1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index NULL idx 14 NULL 1 +# Cleanup +DROP TABLE t1; +# Miscellaneous Tests +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[-1,-1]'), VEC_FromText('[1,1]')) as neg_test; +neg_test +4 +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1.5, 2.5]'), VEC_FromText('[1.5, 2.5]')) as zero_dist; +zero_dist +0 +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1.1]'), VEC_FromText('[2.2]')) as float_test; +float_test +1.100000023841858 diff --git a/mysql-test/main/mdev_35327.test b/mysql-test/main/mdev_35327.test new file mode 100644 index 0000000000000..e3132ac9b5d4f --- /dev/null +++ b/mysql-test/main/mdev_35327.test @@ -0,0 +1,58 @@ +--echo # +--echo # MDEV-35327: Add VEC_DISTANCE_MANHATTAN function +--echo # + +--echo # +--echo # Checking for argument validity +--echo # +--error ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,2]')); +SELECT VEC_DISTANCE_MANHATTAN(NULL, VEC_FromText('[1,2]')); +--echo # Checking for mismatched dimensions +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,1,1]'),VEC_FromText('[1,2]')); + +--echo # +--echo # Basic math check +--echo # +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1,2,3]'), VEC_FromText('[2,3,4]')); + + +--echo # +--echo # Without Vector Index +--echo # +CREATE TABLE t1 (id INT, v VECTOR(3) NOT NULL); +INSERT INTO t1 VALUES (1, VEC_FromText('[2,2,2]')), (2, VEC_FromText('[0,0,5]')), 
(3, VEC_FromText('[1,1,1]')); + +--echo # Manhattan distance:- 6,5,3 Euclidean distance:- 3.46,5,1.73 +--echo # Manhattan | Euclidean +--echo # P3 P3 +--echo # P2 P1 +--echo # P1 P2 +--echo # output should be 3,5,6 and ordering should be P3 < P2 < P1 + +SELECT id, VEC_DISTANCE_MANHATTAN(v, VEC_FromText('[0,0,0]')) as dist FROM t1 ORDER BY dist; +--echo # Comparison with Euclidean distance +SELECT id, VEC_DISTANCE_EUCLIDEAN(v, VEC_FromText('[0,0,0]')) as dist FROM t1 ORDER BY dist; + +--echo # +--echo # With Vector Index +--echo # +CREATE VECTOR INDEX idx ON t1(v) DISTANCE=manhattan; + +--echo # Output should be 3,5 and 6 again +SELECT id, VEC_DISTANCE_MANHATTAN(v, VEC_FromText('[0,0,0]')) as dist FROM t1 ORDER BY dist LIMIT 3; + +--echo # Checking if the vector index is actually implemented using manhattan distance +EXPLAIN SELECT id FROM t1 FORCE INDEX (idx) +ORDER BY VEC_DISTANCE_MANHATTAN(v, VEC_FromText('[0,0,0]')) LIMIT 1; + +--echo # Cleanup +DROP TABLE t1; + +--echo # Miscellaneous Tests + +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[-1,-1]'), VEC_FromText('[1,1]')) as neg_test; + +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1.5, 2.5]'), VEC_FromText('[1.5, 2.5]')) as zero_dist; + +SELECT VEC_DISTANCE_MANHATTAN(VEC_FromText('[1.1]'), VEC_FromText('[2.2]')) as float_test; diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result index cb87a7ba8f1e6..7b16b9d02687c 100644 --- a/mysql-test/main/mysqld--help.result +++ b/mysql-test/main/mysqld--help.result @@ -799,7 +799,7 @@ The following specify which files/extra groups are read (specified before remain Supported MDL namespaces: BACKUP --mhnsw-default-distance=name Distance function to build the vector index for. 
One of: - euclidean, cosine + euclidean, cosine, manhattan --mhnsw-default-m=# Larger values mean slower SELECTs and INSERTs, larger index size and higher memory consumption but more accurate results diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result index 3344eea6148c3..7cd25cbdbe00a 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result @@ -2229,7 +2229,7 @@ VARIABLE_COMMENT Distance function to build the vector index for NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST euclidean,cosine +ENUM_VALUE_LIST euclidean,cosine,manhattan READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME MHNSW_DEFAULT_M diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result index 95d90b797e0bf..2f071973bb822 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result @@ -2469,7 +2469,7 @@ VARIABLE_COMMENT Distance function to build the vector index for NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST euclidean,cosine +ENUM_VALUE_LIST euclidean,cosine,manhattan READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME MHNSW_DEFAULT_M diff --git a/sql/item_create.cc b/sql/item_create.cc index f707607e1e84a..13bda3165bcb2 100644 --- a/sql/item_create.cc +++ b/sql/item_create.cc @@ -6237,6 +6237,24 @@ class Create_func_vec_distance_cosine: public Create_func_arg2 Create_func_vec_distance_cosine Create_func_vec_distance_cosine::s_singleton; + +class Create_func_vec_distance_manhattan: public Create_func_arg2 +{ +public: + Item *create_2_arg(THD *thd, Item *arg1, Item *arg2) override + { return new (thd->mem_root) + Item_func_vec_distance(thd, arg1, arg2, 
Item_func_vec_distance::MANHATTAN); } + + static Create_func_vec_distance_manhattan s_singleton; + +protected: + Create_func_vec_distance_manhattan() = default; + virtual ~Create_func_vec_distance_manhattan() = default; +}; + +Create_func_vec_distance_manhattan Create_func_vec_distance_manhattan::s_singleton; + + class Create_func_vec_distance: public Create_func_arg2 { public: @@ -6251,6 +6269,7 @@ class Create_func_vec_distance: public Create_func_arg2 virtual ~Create_func_vec_distance() = default; }; + Create_func_vec_distance Create_func_vec_distance::s_singleton; class Create_func_vec_totext: public Create_func_arg1 @@ -6516,6 +6535,7 @@ const Native_func_registry func_array[] = { { STRING_WITH_LEN("UUID_SHORT") }, BUILDER(Create_func_uuid_short)}, { { STRING_WITH_LEN("VEC_DISTANCE_EUCLIDEAN") }, BUILDER(Create_func_vec_distance_euclidean)}, { { STRING_WITH_LEN("VEC_DISTANCE_COSINE") }, BUILDER(Create_func_vec_distance_cosine)}, + { { STRING_WITH_LEN("VEC_DISTANCE_MANHATTAN") }, BUILDER(Create_func_vec_distance_manhattan)}, { { STRING_WITH_LEN("VEC_DISTANCE") }, BUILDER(Create_func_vec_distance)}, { { STRING_WITH_LEN("VEC_FROMTEXT") }, BUILDER(Create_func_vec_fromtext)}, { { STRING_WITH_LEN("VEC_TOTEXT") }, BUILDER(Create_func_vec_totext)}, diff --git a/sql/item_vectorfunc.cc b/sql/item_vectorfunc.cc index 354405c3b5fe2..6a716817d9691 100644 --- a/sql/item_vectorfunc.cc +++ b/sql/item_vectorfunc.cc @@ -48,6 +48,17 @@ static double calc_distance_cosine(float *v1, float *v2, size_t v_len) return 1 - dotp/sqrt(abs1*abs2); } +static double calc_distance_manhattan(float *v1, float *v2, size_t v_len) /* L1 distance: sum over all v_len components of the absolute difference of the values read via get_float() */ +{ + double d= 0; + for (size_t i= 0; i < v_len; i++, v1++, v2++) + { + double dist= fabs(get_float(v1) - get_float(v2)); /* fabs, not abs: an unqualified abs() can bind to int abs(int) and truncate the difference */ + d+= dist; + } + return d; +} + Item_func_vec_distance::Item_func_vec_distance(THD *thd, Item *a, Item *b, distance_kind kind) :Item_real_func(thd, a, b), kind(kind) @@ -59,6 +70,7 @@ bool Item_func_vec_distance::fix_length_and_dec(THD *thd) switch 
(kind) { case EUCLIDEAN: calc_distance= calc_distance_euclidean; break; case COSINE: calc_distance= calc_distance_cosine; break; + case MANHATTAN: calc_distance= calc_distance_manhattan; break; case AUTO: for (uint i=0; i < 2; i++) if (auto *item= dynamic_cast(args[i]->real_item())) @@ -90,10 +102,12 @@ key_map Item_func_vec_distance::part_of_sortkey() const Field *f= item->field; KEY *keyinfo= f->table->s->key_info; for (uint i= f->table->s->keys; i < f->table->s->total_keys; i++) + { if (!keyinfo[i].is_ignored && keyinfo[i].algorithm == HA_KEY_ALG_VECTOR && f->key_start.is_set(i) && mhnsw_uses_distance(f->table, keyinfo + i) == kind) map.set_bit(i); + } } return map; } diff --git a/sql/item_vectorfunc.h b/sql/item_vectorfunc.h index bcff3daa7dfb2..72f2c76d227f9 100644 --- a/sql/item_vectorfunc.h +++ b/sql/item_vectorfunc.h @@ -39,13 +39,14 @@ class Item_func_vec_distance: public Item_real_func double (*calc_distance)(float *v1, float *v2, size_t v_len); public: - enum distance_kind { EUCLIDEAN, COSINE, AUTO } kind; + enum distance_kind { EUCLIDEAN, COSINE, MANHATTAN, AUTO } kind; /* the value indexes name[] in func_name_cstring(), so the enumerator order must match that array */ Item_func_vec_distance(THD *thd, Item *a, Item *b, distance_kind kind); LEX_CSTRING func_name_cstring() const override { - static LEX_CSTRING name[3]= { + static LEX_CSTRING name[4]= { /* one entry per distance_kind, in enumerator order; looked up as name[kind] */ { STRING_WITH_LEN("VEC_DISTANCE_EUCLIDEAN") }, { STRING_WITH_LEN("VEC_DISTANCE_COSINE") }, + { STRING_WITH_LEN("VEC_DISTANCE_MANHATTAN")}, { STRING_WITH_LEN("VEC_DISTANCE") } }; return name[kind]; diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index d640363b6e76a..add99f84b5373 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -104,8 +104,8 @@ static MYSQL_THDVAR_UINT(default_m, PLUGIN_VAR_RQCMDARG, "and higher memory consumption but more accurate results", nullptr, nullptr, 6, 3, 200, 1); -enum metric_type : uint { EUCLIDEAN, COSINE }; -static const char *distance_names[]= { "euclidean", "cosine", nullptr }; +enum metric_type : uint { EUCLIDEAN, COSINE, MANHATTAN }; +static const char 
*distance_names[]= { "euclidean", "cosine", "manhattan", nullptr }; /* listed in the same order as the metric_type enumerators */ static TYPELIB distances= CREATE_TYPELIB_FOR(distance_names); static MYSQL_THDVAR_ENUM(default_distance, PLUGIN_VAR_RQCMDARG, "Distance function to build the vector index for", @@ -1749,9 +1749,16 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length) Item_func_vec_distance::distance_kind mhnsw_uses_distance(const TABLE *table, KEY *keyinfo) { - if (keyinfo->option_struct->metric == EUCLIDEAN) - return Item_func_vec_distance::EUCLIDEAN; - return Item_func_vec_distance::COSINE; + switch (keyinfo->option_struct->metric) { /* translate the index option's metric_type into the matching Item_func_vec_distance::distance_kind */ + case EUCLIDEAN: + return Item_func_vec_distance::EUCLIDEAN; + case MANHATTAN: + return Item_func_vec_distance::MANHATTAN; + case COSINE: + return Item_func_vec_distance::COSINE; + default: /* any other metric value is reported as COSINE */ + return Item_func_vec_distance::COSINE; + } } /* diff --git a/storage/connect/connect_jars/JdbcInterface.jar b/storage/connect/connect_jars/JdbcInterface.jar new file mode 100644 index 0000000000000..548d821cb789b Binary files /dev/null and b/storage/connect/connect_jars/JdbcInterface.jar differ