-- 排序
SELECT * FROM vectors ORDER BY feature_vector;
-- 分组
SELECT feature_vector, COUNT(*) FROM vectors GROUP BY feature_vector;
-- 比较
SELECT * FROM vectors WHERE feature_vector = vector(1,2,3);
-- 去重
SELECT DISTINCT feature_vector FROM vectors;
-- 创建向量索引(推荐语法)
CREATE VECTOR INDEX embedding_idx ON TABLE table_name(vector_column)
PROPERTIES(
"distance.function" = "cosine_distance", -- 完整函数名
"scalar.type" = "f32", -- 标准类型名
"M" = 16,
"ef_construction" = 200
);
-- 为已有数据构建索引
BUILD INDEX embedding_idx ON table_name;
-- 查看索引状态
SHOW INDEXES ON table_name;
-- 删除索引
DROP INDEX embedding_idx ON table_name;
索引性能调优
不同参数配置的性能特点:
配置场景
M值
ef_construction
scalar.type
特点
快速检索
8
64
f16
较低的构建时间和内存使用
平衡配置
16
200
f32
平衡的精度和性能
高精度
32
400
f32
更高的检索精度,较高的资源消耗
主流AI模型适配
常见Embedding模型的向量维度
模型提供商
模型名称
维度
建议配置
OpenAI
text-embedding-3-small
1536
VECTOR(FLOAT, 1536)
OpenAI
text-embedding-3-large
3072
VECTOR(FLOAT, 3072)
Google
Universal Sentence Encoder
512
VECTOR(FLOAT, 512)
百度
ERNIE-Embed
768
VECTOR(FLOAT, 768)
智谱
GLM-Embedding
1024
VECTOR(FLOAT, 1024)
多模态模型支持
模型类型
典型维度
应用场景
CLIP
512/768
图文匹配、多模态检索
BLIP
768
图像问答、视觉理解
Vision Transformer
768/1024
图像分类、特征提取
应用场景设计
文档检索系统
-- 知识库表设计
CREATE TABLE knowledge_base (
doc_id BIGINT PRIMARY KEY,
title STRING,
content STRING,
embedding VECTOR(FLOAT, 1536),
created_at TIMESTAMP,
INDEX doc_embedding_idx (embedding) USING VECTOR PROPERTIES (
"scalar.type" = "f32",
"distance.function" = "cosine_distance"
)
);
-- 语义检索查询(推荐模式)
SELECT doc_id, title FROM knowledge_base
WHERE COSINE_DISTANCE(embedding, ?) < 0.7
ORDER BY doc_id LIMIT 10;
推荐系统
-- 用户特征表
CREATE TABLE user_profiles (
user_id BIGINT PRIMARY KEY,
behavior_vector VECTOR(FLOAT, 128),
preference_vector VECTOR(FLOAT, 64),
last_updated TIMESTAMP,
INDEX behavior_idx (behavior_vector) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 物品特征表
CREATE TABLE item_features (
item_id BIGINT PRIMARY KEY,
content_vector VECTOR(FLOAT, 256),
category_vector VECTOR(INT, 50),
INDEX content_idx (content_vector) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 相似用户查找(推荐查询模式)
SELECT target.user_id as similar_user FROM user_profiles source
CROSS JOIN user_profiles target
WHERE source.user_id = ?
AND target.user_id != source.user_id
AND COSINE_DISTANCE(source.behavior_vector, target.behavior_vector) < 0.6
ORDER BY target.user_id LIMIT 20;
图像特征匹配
-- 图像特征库
CREATE TABLE image_features (
image_id BIGINT PRIMARY KEY,
image_path STRING,
visual_features VECTOR(FLOAT, 2048),
semantic_features VECTOR(FLOAT, 512),
upload_time TIMESTAMP,
INDEX visual_idx (visual_features) USING VECTOR PROPERTIES (
"distance.function" = "l2_distance",
"scalar.type" = "f16"
),
INDEX semantic_idx (semantic_features) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 相似图像检索
SELECT image_id, image_path FROM image_features
WHERE L2_DISTANCE(visual_features, ?) < 100.0
ORDER BY image_id LIMIT 50;
️ 重要使用注意事项
向量查询限制
经过UAT环境实际测试,发现两种不同的错误模式:
1. 维度不匹配错误(预期行为)
-- ❌ 错误示例:使用5维向量查询128维数据
SELECT id FROM embeddings_table
WHERE COSINE_DISTANCE(embedding_128d, vector(0.1, 0.2, 0.3, 0.4, 0.5)) < 0.8;
-- 错误信息:
-- function 'COSINE_DISTANCE' cannot be resolved,
-- expect 'vector shall have the same dimension'
解决方法:确保查询向量与表中向量维度完全一致
2. Generated Column内部错误(当前限制)
-- ❌ 问题查询:在SELECT中返回距离计算
SELECT id, content,
COSINE_DISTANCE(embedding, vector(...)) as distance
FROM embeddings_table
ORDER BY distance LIMIT 10;
-- 可能错误:
-- Generated column 4292967295(generated_field_4292967295) is not filled for VECTOR
当前可用的查询模式:
-- ✅ 推荐:WHERE子句中的向量过滤
SELECT id, content FROM embeddings_table
WHERE COSINE_DISTANCE(embedding, vector(...)) < 0.8
ORDER BY id LIMIT 10;
-- ✅ 可用:简单的向量函数
SELECT COSINE_DISTANCE(embedding, embedding) as self_distance
FROM embeddings_table;
-- ❌ 避免:复杂的SELECT返回距离计算并排序
-- ✅ 推荐:利用索引的过滤查询
SELECT doc_id, title FROM knowledge_base
WHERE COSINE_DISTANCE(embedding, ?) < 0.7
ORDER BY doc_id LIMIT 10;
-- ✅ 可选:分步骤的Top-K检索
-- 步骤1:过滤候选
CREATE VIEW candidates AS
SELECT doc_id, title FROM knowledge_base
WHERE COSINE_DISTANCE(embedding, ?) < 0.8;
-- 步骤2:精确排序(如需要)
SELECT doc_id, title FROM candidates ORDER BY doc_id LIMIT 10;
迁移策略
迁移价值评估
在决定是否迁移到VECTOR类型之前,建议先评估你的使用场景:
高价值迁移场景
频繁的向量相似度计算:如推荐系统、文档检索、图像匹配
大规模向量数据:超过百万级别的向量数据查询
复杂的UDF维护:当前需要维护多个自定义向量计算函数
性能要求高:对查询响应时间有严格要求的应用
可选迁移场景
偶尔的向量计算:向量计算不是核心业务流程
小规模数据:向量数据量较小,性能压力不大
稳定的现有系统:当前系统运行稳定,改动风险大于收益
针对不同用户的迁移建议
Spark用户迁移路径
如果你主要使用Spark MLlib进行向量计算:
原Spark代码:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
自定义相似度函数:
def cosine_similarity(v1, v2):
# 复杂的实现逻辑
pass
注册UDF:
cosine_udf = udf(cosine_similarity, DoubleType())
迁移到CloudZetta VECTOR后:
-- 直接使用内置函数和推荐查询模式
SELECT user_id FROM user_features
WHERE COSINE_DISTANCE(user_vector, target_vector) < 0.8;
迁移收益:
删除自定义UDF代码
利用向量索引加速查询
统一在SQL中处理向量计算
Hive用户迁移路径
如果你在Hive中使用ARRAY存储向量:
-- 原Hive表结构
CREATE TABLE user_embeddings (
user_id BIGINT,
features ARRAY<DOUBLE>
) PARTITIONED BY (dt STRING);
-- 需要复杂的UDF
SELECT user_id, my_cosine_udf(features, target_array) as score
FROM user_embeddings
WHERE size(features) = 128; -- 手动维度检查
迁移到 VECTOR 类型:
-- 新的表结构
CREATE TABLE user_embeddings_v2 (
user_id BIGINT,
features VECTOR(FLOAT, 128) -- 类型安全
) PARTITIONED BY (dt STRING);
-- 使用内置函数和推荐模式
SELECT user_id FROM user_embeddings_v2
WHERE COSINE_DISTANCE(features, vector(...)) < 0.8; -- 无需手动维度检查
评估现有系统
数据规模评估
-- 统计现有向量数据的规模和维度
SELECT
table_name,
column_name,
COUNT(*) as record_count,
AVG(size(array_column)) as avg_dimension,
MIN(size(array_column)) as min_dimension,
MAX(size(array_column)) as max_dimension,
COUNT(DISTINCT size(array_column)) as dimension_variants
FROM (
SELECT 'user_features' as table_name, 'embedding' as column_name, embedding as array_column FROM user_features
UNION ALL
SELECT 'product_vectors' as table_name, 'features' as column_name, features as array_column FROM product_vectors
-- 添加其他需要评估的表
) all_vectors
GROUP BY table_name, column_name;
UDF复杂度评估
-- 识别当前使用的向量相关UDF
SHOW FUNCTIONS LIKE '*similarity*';
SHOW FUNCTIONS LIKE '*distance*';
SHOW FUNCTIONS LIKE '*cosine*';
SHOW FUNCTIONS LIKE '*euclidean*';
查询频率评估
-- 分析向量计算查询的频率(如果有查询日志)
SELECT
DATE(query_time) as query_date,
COUNT(*) as vector_query_count
FROM query_logs
WHERE query_text LIKE '%cosine%'
OR query_text LIKE '%similarity%'
OR query_text LIKE '%distance%'
GROUP BY DATE(query_time)
ORDER BY query_date DESC
LIMIT 30;
并行构建新表
-- 创建新的向量表
CREATE TABLE embeddings_new (
id BIGINT,
content STRING,
embedding VECTOR(FLOAT, 768),
migrated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP(),
INDEX embedding_idx (embedding) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 数据迁移
INSERT INTO embeddings_new (id, content, embedding)
SELECT
id,
content,
CASE
WHEN size(embedding_array) = 768
THEN cast(embedding_array as VECTOR(FLOAT, 768))
ELSE NULL
END as embedding
FROM embeddings_old
WHERE size(embedding_array) = 768;
-- 构建索引
BUILD INDEX embedding_idx ON embeddings_new;
验证迁移结果
-- 对比查询结果的一致性(使用推荐查询模式)
WITH old_results AS (
SELECT id FROM embeddings_old
WHERE your_cosine_udf(embedding_array, ?) > 0.8
ORDER BY id LIMIT 10
),
new_results AS (
SELECT id FROM embeddings_new
WHERE COSINE_DISTANCE(embedding, ?) < 0.2 -- 注意距离vs相似度的转换
ORDER BY id LIMIT 10
)
SELECT
COALESCE(o.id, n.id) as id,
CASE WHEN o.id IS NOT NULL THEN 'OLD' ELSE NULL END as in_old,
CASE WHEN n.id IS NOT NULL THEN 'NEW' ELSE NULL END as in_new
FROM old_results o
FULL OUTER JOIN new_results n ON o.id = n.id
ORDER BY id;
性能优化
查询优化
-- ✅ 推荐:使用阈值过滤提升查询效率
SELECT id, content FROM embeddings_table
WHERE COSINE_DISTANCE(embedding, ?) < 0.8 -- 利用向量索引过滤
ORDER BY id LIMIT 10;
-- ✅ 可选:分步骤处理复杂查询
CREATE VIEW filtered_candidates AS
SELECT id, content FROM embeddings_table
WHERE COSINE_DISTANCE(embedding, ?) < 0.8;
SELECT id, content FROM filtered_candidates
ORDER BY id LIMIT 10;
存储优化
-- 大规模数据使用半精度存储
CREATE TABLE large_embeddings (
id BIGINT,
embedding VECTOR(FLOAT, 1536),
INDEX embedding_idx (embedding) USING VECTOR PROPERTIES (
"scalar.type" = "f16", -- 半精度存储节省内存
"distance.function" = "cosine_distance"
)
);