-- 排序
SELECT * FROM vectors ORDER BY feature_vector;
-- 分组
SELECT feature_vector, COUNT(*) FROM vectors GROUP BY feature_vector;
-- 比较
SELECT * FROM vectors WHERE feature_vector = vector(1,2,3);
-- 去重
SELECT DISTINCT feature_vector FROM vectors;
-- 创建向量索引(推荐语法)
CREATE VECTOR INDEX embedding_idx ON TABLE table_name(vector_column)
PROPERTIES(
"distance.function" = "cosine_distance", -- 完整函数名
"scalar.type" = "f32", -- 标准类型名
"M" = 16,
"ef_construction" = 200
);
-- 为已有数据构建索引
BUILD INDEX embedding_idx ON table_name;
-- 查看索引状态
SHOW INDEXES ON table_name;
-- 删除索引
DROP INDEX embedding_idx ON table_name;
索引性能调优
不同参数配置的性能特点:
配置场景
M值
ef_construction
scalar.type
特点
快速检索
8
64
f16
较低的构建时间和内存使用
平衡配置
16
200
f32
平衡的精度和性能
高精度
32
400
f32
更高的检索精度,较高的资源消耗
🚀 主流AI模型适配
常见Embedding模型的向量维度
模型提供商
模型名称
维度
建议配置
OpenAI
text-embedding-3-small
1536
VECTOR(FLOAT, 1536)
OpenAI
text-embedding-3-large
3072
VECTOR(FLOAT, 3072)
Google
Universal Sentence Encoder
512
VECTOR(FLOAT, 512)
百度
ERNIE-Embed
768
VECTOR(FLOAT, 768)
智谱
GLM-Embedding
1024
VECTOR(FLOAT, 1024)
多模态模型支持
模型类型
典型维度
应用场景
CLIP
512/768
图文匹配、多模态检索
BLIP
768
图像问答、视觉理解
Vision Transformer
768/1024
图像分类、特征提取
💡 应用场景设计
文档检索系统
-- 知识库表设计
CREATE TABLE knowledge_base (
doc_id BIGINT PRIMARY KEY,
title STRING,
content STRING,
embedding VECTOR(FLOAT, 1536),
created_at TIMESTAMP,
INDEX doc_embedding_idx (embedding) USING VECTOR PROPERTIES (
"scalar.type" = "f32",
"distance.function" = "cosine_distance"
)
);
-- 语义检索查询(推荐模式)
SELECT doc_id, title FROM knowledge_base
WHERE COSINE_DISTANCE(embedding, ?) < 0.7
ORDER BY doc_id LIMIT 10;
推荐系统
-- 用户特征表
CREATE TABLE user_profiles (
user_id BIGINT PRIMARY KEY,
behavior_vector VECTOR(FLOAT, 128),
preference_vector VECTOR(FLOAT, 64),
last_updated TIMESTAMP,
INDEX behavior_idx (behavior_vector) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 物品特征表
CREATE TABLE item_features (
item_id BIGINT PRIMARY KEY,
content_vector VECTOR(FLOAT, 256),
category_vector VECTOR(INT, 50),
INDEX content_idx (content_vector) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 相似用户查找(推荐查询模式)
SELECT target.user_id as similar_user FROM user_profiles source
CROSS JOIN user_profiles target
WHERE source.user_id = ?
AND target.user_id != source.user_id
AND COSINE_DISTANCE(source.behavior_vector, target.behavior_vector) < 0.6
ORDER BY target.user_id LIMIT 20;
图像特征匹配
-- 图像特征库
CREATE TABLE image_features (
image_id BIGINT PRIMARY KEY,
image_path STRING,
visual_features VECTOR(FLOAT, 2048),
semantic_features VECTOR(FLOAT, 512),
upload_time TIMESTAMP,
INDEX visual_idx (visual_features) USING VECTOR PROPERTIES (
"distance.function" = "l2_distance",
"scalar.type" = "f16"
),
INDEX semantic_idx (semantic_features) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 相似图像检索
SELECT image_id, image_path FROM image_features
WHERE L2_DISTANCE(visual_features, ?) < 100.0
ORDER BY image_id LIMIT 50;
⚠️ 重要避坑指南
🚨 向量查询限制
经过UAT环境实际测试,发现两种不同的错误模式:
1. 维度不匹配错误(预期行为)
-- ❌ 错误示例:使用5维向量查询128维数据
SELECT id FROM embeddings_table
WHERE COSINE_DISTANCE(embedding_128d, vector(0.1, 0.2, 0.3, 0.4, 0.5)) < 0.8;
-- 错误信息:
-- function 'COSINE_DISTANCE' cannot be resolved,
-- expect 'vector shall have the same dimension'
解决方法:确保查询向量与表中向量维度完全一致
2. Generated Column内部错误(当前限制)
-- ❌ 问题查询:在SELECT中返回距离计算
SELECT id, content,
COSINE_DISTANCE(embedding, vector(...)) as distance
FROM embeddings_table
ORDER BY distance LIMIT 10;
-- 可能错误:
-- Generated column 4292967295(generated_field_4292967295) is not filled for VECTOR
当前可用的查询模式:
-- ✅ 推荐:WHERE子句中的向量过滤
SELECT id, content FROM embeddings_table
WHERE COSINE_DISTANCE(embedding, vector(...)) < 0.8
ORDER BY id LIMIT 10;
-- ✅ 可用:简单的向量函数
SELECT COSINE_DISTANCE(embedding, embedding) as self_distance
FROM embeddings_table;
-- ❌ 避免:复杂的SELECT返回距离计算并排序
-- ✅ 推荐:利用索引的过滤查询
SELECT doc_id, title FROM knowledge_base
WHERE COSINE_DISTANCE(embedding, ?) < 0.7
ORDER BY doc_id LIMIT 10;
-- ✅ 可选:分步骤的Top-K检索
-- 步骤1:过滤候选
CREATE VIEW candidates AS
SELECT doc_id, title FROM knowledge_base
WHERE COSINE_DISTANCE(embedding, ?) < 0.8;
-- 步骤2:精确排序(如需要)
SELECT doc_id, title FROM candidates ORDER BY doc_id LIMIT 10;
-- 直接使用内置函数和推荐查询模式
SELECT user_id FROM user_features
WHERE COSINE_DISTANCE(user_vector, target_vector) < 0.8;
迁移收益:
删除自定义UDF代码
利用向量索引加速查询
统一在SQL中处理向量计算
Hive用户迁移路径
如果您在Hive中使用ARRAY存储向量:
-- 原Hive表结构
CREATE TABLE user_embeddings (
user_id BIGINT,
features ARRAY<DOUBLE>
) PARTITIONED BY (dt STRING);
-- 需要复杂的UDF
SELECT user_id, my_cosine_udf(features, target_array) as score
FROM user_embeddings
WHERE size(features) = 128; -- 手动维度检查
迁移到VECTOR类型:
-- 新的表结构
CREATE TABLE user_embeddings_v2 (
user_id BIGINT,
features VECTOR(FLOAT, 128) -- 类型安全
) PARTITIONED BY (dt STRING);
-- 使用内置函数和推荐模式
SELECT user_id FROM user_embeddings_v2
WHERE COSINE_DISTANCE(features, vector(...)) < 0.8; -- 无需手动维度检查
评估现有系统
数据规模评估
-- 统计现有向量数据的规模和维度
SELECT
table_name,
column_name,
COUNT(*) as record_count,
AVG(size(array_column)) as avg_dimension,
MIN(size(array_column)) as min_dimension,
MAX(size(array_column)) as max_dimension,
COUNT(DISTINCT size(array_column)) as dimension_variants
FROM (
SELECT 'user_features' as table_name, 'embedding' as column_name, embedding as array_column FROM user_features
UNION ALL
SELECT 'product_vectors' as table_name, 'features' as column_name, features as array_column FROM product_vectors
-- 添加其他需要评估的表
) all_vectors
GROUP BY table_name, column_name;
UDF复杂度评估
-- 识别当前使用的向量相关UDF
SHOW FUNCTIONS LIKE '*similarity*';
SHOW FUNCTIONS LIKE '*distance*';
SHOW FUNCTIONS LIKE '*cosine*';
SHOW FUNCTIONS LIKE '*euclidean*';
查询频率评估
-- 分析向量计算查询的频率(如果有查询日志)
SELECT
DATE(query_time) as query_date,
COUNT(*) as vector_query_count
FROM query_logs
WHERE query_text LIKE '%cosine%'
OR query_text LIKE '%similarity%'
OR query_text LIKE '%distance%'
GROUP BY DATE(query_time)
ORDER BY query_date DESC
LIMIT 30;
并行构建新表
-- 创建新的向量表
CREATE TABLE embeddings_new (
id BIGINT,
content STRING,
embedding VECTOR(FLOAT, 768),
migrated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP(),
INDEX embedding_idx (embedding) USING VECTOR PROPERTIES (
"distance.function" = "cosine_distance"
)
);
-- 数据迁移
INSERT INTO embeddings_new (id, content, embedding)
SELECT
id,
content,
CASE
WHEN size(embedding_array) = 768
THEN cast(embedding_array as VECTOR(FLOAT, 768))
ELSE NULL
END as embedding
FROM embeddings_old
WHERE size(embedding_array) = 768;
-- 构建索引
BUILD INDEX embedding_idx ON embeddings_new;
验证迁移结果
-- 对比查询结果的一致性(使用推荐查询模式)
WITH old_results AS (
SELECT id FROM embeddings_old
WHERE your_cosine_udf(embedding_array, ?) > 0.8
ORDER BY id LIMIT 10
),
new_results AS (
SELECT id FROM embeddings_new
WHERE COSINE_DISTANCE(embedding, ?) < 0.2 -- 注意距离vs相似度的转换
ORDER BY id LIMIT 10
)
SELECT
COALESCE(o.id, n.id) as id,
CASE WHEN o.id IS NOT NULL THEN 'OLD' ELSE NULL END as in_old,
CASE WHEN n.id IS NOT NULL THEN 'NEW' ELSE NULL END as in_new
FROM old_results o
FULL OUTER JOIN new_results n ON o.id = n.id
ORDER BY id;
🚀 性能优化
查询优化
-- ✅ 推荐:使用阈值过滤提升查询效率
SELECT id, content FROM embeddings_table
WHERE COSINE_DISTANCE(embedding, ?) < 0.8 -- 利用向量索引过滤
ORDER BY id LIMIT 10;
-- ✅ 可选:分步骤处理复杂查询
CREATE VIEW filtered_candidates AS
SELECT id, content FROM embeddings_table
WHERE COSINE_DISTANCE(embedding, ?) < 0.8;
SELECT id, content FROM filtered_candidates
ORDER BY id LIMIT 10;
存储优化
-- 大规模数据使用半精度存储
CREATE TABLE large_embeddings (
id BIGINT,
embedding VECTOR(FLOAT, 1536),
INDEX embedding_idx (embedding) USING VECTOR PROPERTIES (
"scalar.type" = "f16", -- 半精度存储节省内存
"distance.function" = "cosine_distance"
)
);