--Describes New York City For-Hire-Vehicle trips.
desc clickzetta_sample_data.nyc_taxi_tripdata.fhvhv_tripdata;
预览
2.2 预览数据明细
--Sample Of Trip Record Data
select * from clickzetta_sample_data.nyc_taxi_tripdata.fhvhv_tripdata limit 10;
预览
2.3 查看数据集的记录数
--1.49 billion rows
select count(*) from clickzetta_sample_data.nyc_taxi_tripdata.fhvhv_tripdata;
Step02 使用通用型集群清洗转换数据
1.指定使用ETL_VC进行数据加工,同时创建目标表所在schema
--要和执行的SQL一起选中执行才会生效
use vcluster ETL_VC;
create schema tutorial;
use tutorial;
2.通过CTAS对原始数据集进行清洗转换写入新表
--2.对原始数据集进行清洗转换
CREATE table tutorial.int_fhvhv_tripdata
as
SELECT
hvfhs_license_num,
CASE
WHEN hvfhs_license_num = 'HV0002' THEN 'juno'
WHEN hvfhs_license_num = 'HV0003' THEN 'uber'
WHEN hvfhs_license_num = 'HV0004' THEN 'via'
WHEN hvfhs_license_num = 'HV0005' THEN 'lyft'
ELSE null
END AS company,
ltrim(rtrim(upper(dispatching_base_num))) dispatching_base_num,
ltrim(rtrim(upper(originating_base_num))) originating_base_num,
request_datetime,
on_scene_datetime,
pickup_datetime,
dropoff_datetime,
PULocationID,
DOLocationID,
trip_miles,
trip_time,
base_passenger_fare,
tolls,
bcf,
sales_tax,
congestion_surcharge,
airport_fee,
tips,
driver_pay,
CASE
WHEN shared_request_flag = 'Y' THEN true
WHEN shared_request_flag IN ('N', ' ') THEN false
ELSE null
END AS shared_request_flag,
CASE
WHEN shared_match_flag = 'Y' THEN true
WHEN shared_match_flag IN ('N', ' ') THEN false
ELSE null
END AS shared_match_flag,
CASE
WHEN access_a_ride_flag = 'Y' THEN true
WHEN access_a_ride_flag IN ('N', ' ') THEN false
ELSE null
END AS access_a_ride_flag,
CASE
WHEN wav_request_flag = 'Y' THEN true
WHEN wav_request_flag IN ('N', ' ') THEN false
ELSE null
END AS wav_request_flag,
CASE
WHEN wav_match_flag = 'Y' THEN true
WHEN wav_match_flag IN ('N', ' ') THEN false
ELSE null
END AS wav_match_flag
FROM clickzetta_sample_data.nyc_taxi_tripdata.fhvhv_tripdata;
验证加工后的数据
SELECT * FROM tutorial.int_fhvhv_tripdata LIMIT 10;
预览
3.对清洗后的数据按照分析主题分别进行聚合计算,生成用于分析的数据表
--Scenario 1: Analyze taxi trip patterns by time of day
CREATE table tutorial.mart_trips_pattern_by_time
AS
SELECT
EXTRACT(HOUR FROM pickup_datetime) AS hour,
COUNT(*) AS trip_count
FROM tutorial.int_fhvhv_tripdata
GROUP BY hour;
--Scenario 2: Analyze taxi trip patterns by day of the week
CREATE table tutorial.mart_trips_pattern_by_dayofweek
AS
SELECT
EXTRACT(DAY FROM pickup_datetime) AS day_of_week,
COUNT(*) AS trip_count
FROM tutorial.int_fhvhv_tripdata
GROUP BY day_of_week;
--Scenario 3: Analyze taxi trip patterns by pickup location
CREATE table tutorial.mart_trips_pattern_by_pickup_location
AS
SELECT
PULocationID,
COUNT(*) AS trip_count
FROM tutorial.int_fhvhv_tripdata
GROUP BY PULocationID;
--Scenario 4: Analyze taxi trip patterns by dropoff location
CREATE table tutorial.mart_trips_pattern_by_dropoff_location
AS
SELECT
DOLocationID,
COUNT(*) AS trip_count
FROM tutorial.int_fhvhv_tripdata
GROUP BY DOLocationID;
--Scenario 5:Trips per day
CREATE table tutorial.mart_trips_per_day
AS
SELECT
pickup_datetime::date AS date,
sum(trip_miles) AS trip_miles
FROM tutorial.int_fhvhv_tripdata
GROUP BY date;
--Scenario 6:Total driver pay per company
CREATE table tutorial.mart_trips_driver_pay_per_company
AS
SELECT
CONCAT(YEAR(pickup_datetime), '-', MONTH(pickup_datetime)) AS year_month,
company,
sum(driver_pay) AS driver_pay
FROM tutorial.int_fhvhv_tripdata
GROUP BY year_month,company;
检查数据对象是否创建成功。
--检查新创建数据模型的状态
show tables in tutorial;
预览
--检查新创建数据模型的数据
SELECT * FROM tutorial.mart_trips_driver_pay_per_company
WHERE substr(year_month,0,4)='2021'
ORDER BY year_month ASC;
预览
Step03 使用分析型集群单并发查询
1.切换当前Session使用的虚拟集群为REPORTING_VC
-- 1.使用分析型VC进行加速查询分析,要和执行的SQL一起选中执行才会生效
USE VCLUSTER REPORTING_VC;
--设置查询作业标签,以便检索过滤,要和执行的SQL一起选中执行才会生效
SET QUERY_TAG = 'Tutorial02';
2.串行执行6个业务分析查询
--Scenario 1: Analyze taxi trip patterns by time of day
SELECT * FROM tutorial.mart_trips_pattern_by_time
ORDER BY HOUR ASC;
--Scenario 2: Analyze taxi trip patterns by day of the week
SELECT * FROM tutorial.mart_trips_pattern_by_dayofweek
ORDER BY day_of_week ASC;
--Scenario 3: Analyze taxi trip patterns by pickup location
SELECT * FROM tutorial.mart_trips_pattern_by_pickup_location
ORDER BY trip_count DESC
LIMIT 10;
--Scenario 4: Analyze taxi trip patterns by dropoff location
SELECT * FROM tutorial.mart_trips_pattern_by_dropoff_location
ORDER BY trip_count DESC
LIMIT 10;
--Scenario 5:Trips per day
SELECT * FROM tutorial.mart_trips_per_day
WHERE CONCAT(YEAR(date) , MONTH(date)) = '202110'
ORDER BY date;
--Scenario 6:Total driver pay per company
SELECT * FROM tutorial.mart_trips_driver_pay_per_company
WHERE substr(year_month,0,4)='2021'
ORDER BY year_month ASC;
3.观察查询的Latency结果
--清除QUERY_TAG
SET QUERY_TAG = '';
--查看运行的作业执行结果
SHOW JOBS WHERE QUERY_TAG='Tutorial02' LIMIT 10;