-- Data_Clean virtual cluster
CREATE VCLUSTER IF NOT EXISTS Data_Clean
VCLUSTER_SIZE = XSMALL
VCLUSTER_TYPE = GENERAL
AUTO_SUSPEND_IN_SECOND = 60
AUTO_RESUME = TRUE
COMMENT 'Data_Clean VCLUSTER for test';
-- Use our VCLUSTER
USE VCLUSTER Data_Clean;
-- Create and Use SCHEMA
CREATE SCHEMA IF NOT EXISTS Data_Clean;
USE SCHEMA Data_Clean;
缺失值是数据清洗中常见的问题。它们会导致数据分析结果不准确。可以使用 COALESCE 函数、IFNULL 函数或者 CASE 语句填充默认值,或替换缺失值。在实际项目中,处理缺失值常用于确保关键字段不为空,保证数据的完整性。
实现
-- 使用 COALESCE 填充默认值
SELECT id, sale_date, COALESCE(customer_id, 0) AS customer_id, product_id, quantity, price, COALESCE(total_amount, 0) AS total_amount, region FROM sales_data;
-- 使用 IFNULL 填充默认值
SELECT id, sale_date, IFNULL(customer_id, 0) AS customer_id, product_id, quantity, price, IFNULL(total_amount, 0) AS total_amount, region FROM sales_data;
-- 使用 CASE 语句处理缺失值
SELECT id,
CASE
WHEN sale_date IS NULL THEN '2025-01-01'
ELSE sale_date
END AS sale_date,
customer_id,
product_id,
quantity,
price,
total_amount,
region
FROM sales_data;
-- 将字符串转换为日期
SELECT id, CAST(sale_date AS DATE) AS sale_date,
customer_id, product_id, quantity,
CAST(price AS DECIMAL(10, 2)) AS price,
CAST(total_amount AS DECIMAL(10, 2)) AS total_amount, region
FROM sales_data;
删除空格
说明
在数据清洗过程中,字符串前后的空格会导致数据分析结果的不准确。我们可以使用 TRIM 函数删除空格。在实际项目中,删除空格常用于清理包含多余空格的文本字段。
实现
-- 删除空白值
SELECT id,
TRIM(sale_date) AS sale_date,
customer_id,
product_id,
quantity,
price,
total_amount,
TRIM(region) AS region
FROM sales_data;
-- 使用 DISTINCT 去重
SELECT DISTINCT customer_id, product_id, region FROM sales_data;
-- 使用 ROW_NUMBER() 去重
WITH RowNumCTE AS (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY customer_id, product_id, region ORDER BY id) AS row_num
FROM sales_data
)
SELECT id, sale_date, customer_id, product_id, quantity, price, total_amount, region
FROM RowNumCTE
WHERE row_num = 1;
-- 按区域分组计算总销售额
SELECT region, SUM(total_amount) AS total_sales FROM sales_data GROUP BY region;
-- 按产品分组计算总销售量
SELECT product_id, SUM(quantity) AS total_quantity FROM sales_data GROUP BY product_id;
数据筛选
说明
使用 WHERE 子句筛选出符合特定条件的数据。在实际项目中,数据筛选常用于提取感兴趣的数据子集,如筛选出高价值客户、特定时间段的销售数据等。
实现
-- 筛选出销售金额大于500的记录
SELECT * FROM sales_data WHERE total_amount > 500;
-- 合并产品ID和区域字段
SELECT id, sale_date, customer_id, product_id || '-' || region AS combined_field, quantity, price, total_amountFROM sales_data;
合并数据
说明
使用 UNION 操作将多个结果集合并在一起,形成一个完整的结果集。在实际项目中,合并数据常用于将多个查询结果整合在一起,形成统一的分析数据集。
实现
-- 合并两个结果集
SELECT id, sale_date, customer_id, product_id, quantity, price, total_amount, region FROM sales_data
UNION
SELECT id, sale_date, customer_id, product_id, quantity, price, total_amount, region FROM another_sales_data;