功能描述
将表或查询结果导出到指定路径。您可以将一张表或查询结果导出为文件并保存到 Volume 的指定路径。请注意,如果指定路径下已存在同名文件,则该文件将会被覆盖。
注意事项
- 默认文件名:Lakehouse 每次导出时使用相同的默认文件名。因此,如果您多次运行导出操作并且指定了相同的子目录(SUBDIRECTORY),新文件将覆盖上次导出的文件。
- 避免覆盖:为了防止意外覆盖现有文件,请确保每次导出时使用唯一的文件名或子目录。
语法
---语法
COPY INTO { VOLUME external_volume_name | TABLE VOLUME table_name | USER VOLUME }
SUBDIRECTORY '<path>'
FROM { [schema.]<table_name> | ( <query> ) }
FILE_FORMAT = ( TYPE = { CSV | PARQUET | JSON } [ formatTypeOptions ] )
[ copyOptions ]
参数说明
--直接指定表
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
--将SQL结果集导出
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM (select * from birds limit 1)
file_format = (type = CSV);
-
formatTypeOptions:文件格式,支持CSV、TEXT、PARQUET、JSON。其中JSON导出的格式是JSON LINE
-
CSV 格式支持的参数:
- sep:列分隔符,默认为 ','(英文逗号)。最多支持长度为 1 的字符,例如:
'sep'=','
或者sep = '\001'
:使用 Hive 默认的字段分隔符(不可见字符)
- compression:配置文件压缩格式。支持的压缩格式有:gzip/zstd/zlib。例如:
compression='gzip'
- lineSep:行分隔符,默认值为"\n"。最多支持长度为 2 的字符,例如:
lineSep='$'
- quote:用于转义包含特殊字符的字段。默认值为双引号 '"',例如:
quote = '"'
:使用双引号作为转义字符。 quote = '\''
:使用单引号作为转义字符。
- header:是否解析表头,默认为 false。布尔类型,例如:
header='true'
- timeZone:配置时区,没有默认值。用于指定文件中时间格式的时区。例如:
timeZone = 'Asia/Shanghai'
- escape:用于转义已加引号的值中的特殊字符,仅支持单字节字符。默认值为 '\',例如:
'escape'=r'\'
。转义字符用于解释数据文件中字段分隔符或换行符的特殊含义。如果某一行以反斜杠结尾,则该字符会转义为文件格式选项中指定的换行符或回车符(lineSep
)。因此,加载操作会将此行和下一行视为一行数据。如果不需要这种转义行为,可以将转义字符设置为其他字符
- nullValue:用于判定什么样的内容应该是 Null,默认值为 `""`,例如 `'nullValue'='\\N'` 或者 `'nullValue'=r'\N'`。为了避免在不确定何时需要转义的情况下手动处理转义字符,Lakehouse 支持在字符串前添加
r
前缀,表示字符串中的转义字符不会被转义,可以直接输入到表达式中运行。
- multiLine: 是否有跨行的 csv 记录,默认值为
false
,如果有这样的记录,需要配置 multiLine='true'
- writebom:BOM(Byte Order Mark)是位于文件开头的特殊标记字符(U+FEFF)。在 CSV 文件中,它的主要作用包括:明确声明文件使用 UTF-8/UTF-16 等 Unicode 编码,并帮助软件自动识别编码格式,避免解码错误。Windows 版 Excel 依赖 BOM 识别 UTF-8 编码文件;无 BOM 时,中文/日文等非 ASCII 字符可能显示为乱码。推荐使用场景:导出的 CSV 文件需要在 Windows Excel 中打开。
writebom=true
--指定分隔符为|和压缩
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= CSV
sep='|'
compression='gzip'
);
-
JSON格式支持的参数:
-
Parquet , ORC , BSON 格式:无
-
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= parquet
);
-
copyOptions
overwrite=true
:清空目标文件夹后再导出(含子目录)。特别注意:会清空子目录。
- filename_prefix = '<prefix_name>'。可选参数。设置文件前缀,例如:filename_prefix = 'my_prefix_'
--给文件添加前缀
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= json
)
filename_prefix='birds'
;
--查看目录,如第一行添加了前缀birds
SHOW VOLUME DIRECTORY my_volume SUBDIRECTORY 'dau_unload/';
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| relative_path | url | size | last_modified_time |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| dau_unload/birds00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/birds00001.json | 295 | 2024-12-27 17:29:20 |
| dau_unload/part00001.csv | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv | 105 | 2024-12-27 17:15:40 |
| dau_unload/part00001.csv.gzip | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv.gzip | 104 | 2024-12-27 17:19:33 |
| dau_unload/part00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.json | 295 | 2024-12-27 17:24:26 |
| dau_unload/part00001.parquet | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.parquet | 1886 | 2024-12-27 17:27:15 |
| dau_unload/part00001.text | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.text | 87 | 2024-12-27 17:25:34 |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
- filename_suffix = '<suffix>'。可选参数。设置文件后缀,例如:filename_suffix = '.data'
--添加后缀
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= json
)
filename_suffix='.data';
--查看目录,如第四行添加了后缀.data
SHOW VOLUME DIRECTORY my_volume SUBDIRECTORY 'dau_unload/';
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| relative_path | url | size | last_modified_time |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| dau_unload/birds00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/birds00001.json | 295 | 2024-12-27 17:29:20 |
| dau_unload/part00001.csv | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv | 105 | 2024-12-27 17:15:40 |
| dau_unload/part00001.csv.gzip | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv.gzip | 104 | 2024-12-27 17:19:33 |
| dau_unload/part00001.data | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.data | 295 | 2024-12-27 17:33:49 |
| dau_unload/part00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.json | 295 | 2024-12-27 17:24:26 |
| dau_unload/part00001.parquet | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.parquet | 1886 | 2024-12-27 17:27:15 |
| dau_unload/part00001.text | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.text | 87 | 2024-12-27 17:25:34 |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
- include_job_id = 'TRUE' | 'FALSE'。 可选参数。设置文件名是否写入作业ID,不设置时默认为不写入作业ID。例如:include_job_id = 'TRUE'
--导出的文件包含jobid
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= json
)
include_job_id = 'TRUE' ;
--查看目录,如第8行导出的文件名包含了作业ID(jobid)
+--------------------------------------------------------+----------------------------------------------------------------------------------------------+------+---------------------+
| relative_path | url | size | last_modified_time |
+--------------------------------------------------------+----------------------------------------------------------------------------------------------+------+---------------------+
| dau_unload/birds00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/birds00001.json | 295 | 2024-12-27 17:29:20 |
| dau_unload/part00001.csv | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv | 105 | 2024-12-27 17:15:40 |
| dau_unload/part00001.csv.gzip | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv.gzip | 104 | 2024-12-27 17:19:33 |
| dau_unload/part00001.data | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.data | 295 | 2024-12-27 17:33:49 |
| dau_unload/part00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.json | 295 | 2024-12-27 17:24:26 |
| dau_unload/part00001.parquet | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.parquet | 1886 | 2024-12-27 17:27:15 |
| dau_unload/part00001.text | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.text | 87 | 2024-12-27 17:25:34 |
| dau_unload/part202412271736045501gmspelya5o900001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part202412271736045501gmspelya5o900001.json | 295 | 2024-12-27 17:36:04 |
+--------------------------------------------------------+----------------------------------------------------------------------------------------------+------+---------------------+
使用示例
导出数据到user volume中
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
COPY INTO USER VOLUME SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
--查看是否导出成功
SHOW USER VOLUME DIRECTORY SUBDIRECTORY 'dau_unload/';
--删除文件避免占用存储
REMOVE USER VOLUME FILE 'dau_unload/part00001.csv';
导出数据到table volume中
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
COPY INTO TABLE VOLUME birds SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
--查看是否导出成功
SHOW TABLE VOLUME DIRECTORY birds SUBDIRECTORY 'dau_unload/';
--删除文件避免占用存储
REMOVE TABLE VOLUME birds FILE 'dau_unload/part00001.csv';
导出到外部 Volume 的使用前提:需要先创建 VOLUME 和 CONNECTION,创建过程可以参考 CONNECTION 创建和 VOLUME 创建
导出数据到oss中
--创建表
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
--创建oss connection
CREATE STORAGE CONNECTION catalog_storage_oss
type OSS
ACCESS_ID='xxxx'
ACCESS_KEY='xxxxxxx'
ENDPOINT='oss-cn-hangzhou-internal.aliyuncs.com';
--创建volume
CREATE EXTERNAL VOLUME my_volume
location 'oss://mybucket/test_insert/'
using connection catalog_storage_oss
directory = (
enable=true,
auto_refresh=true
);
--将数据导出到dau_unload子目录下
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
导出数据到cos中
--创建表
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
--创建cos connection
CREATE STORAGE CONNECTION my_conn
TYPE COS
ACCESS_KEY = '<access_key>'
SECRET_KEY = '<secret_key>'
REGION = 'ap-shanghai'
APP_ID = '1310000503';
--创建volume
CREATE EXTERNAL VOLUME my_volume
location 'cos://mybucket/test_insert/'
using connection my_conn
directory = (
enable=true,
auto_refresh=true
);
--将数据导出到dau_unload子目录下
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
导出数据到s3中
--创建表
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
--创建s3 connection
CREATE STORAGE CONNECTION aws_bj_conn
TYPE S3
ACCESS_KEY = 'AKIAQNBSBP6EIJE33***'
SECRET_KEY = '7kfheDrmq***************************'
ENDPOINT = 's3.cn-north-1.amazonaws.com.cn'
REGION = 'cn-north-1';
--创建volume
CREATE EXTERNAL VOLUME my_volume
location 's3://mybucket/test_insert/'
using connection aws_bj_conn
directory = (
enable=true,
auto_refresh=true
);
--将数据导出到dau_unload子目录下
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
导出EXCEL兼容格式避免乱码,并下载到本地
COPY INTO user volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= CSV
writebom=true
);