Hive QL常用操作

一、数据定义(DDL)

创建、修改和删除数据库、表、视图、函数和索引

1.1 创建、修改和删除数据库

1
2
3
4
5
6
7
8
9
10
-- NOTE: HiveQL line comments use "--"; the original "#" marker is not valid Hive syntax.
create database if not exists hive;                    -- create the database (no-op if it exists)
show databases;                                        -- list all databases in Hive
show databases like 'h.*';                             -- list databases whose name starts with "h"
describe database db_name;                             -- show location and other database metadata
alter database hive set dbproperties('name'= 'test');  -- attach a key/value property to the database
desc database extended hive;                           -- show the database including its dbproperties
use hive;                                              -- switch the current database to "hive"
drop database if exists hive;                          -- drop the database (fails if it still holds tables)
drop database if exists hive cascade;                  -- drop the database together with all its tables
select current_database();                             -- show which database is currently in use

1.2 创建、修改和删除表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
-- Create a managed (internal) table: dropping it also deletes the data
create table if not exists hive.usr(
name string comment 'username',
pwd string comment 'password',
address struct<street:string,city:string,state:string,zip:int>
comment 'home address',
identify map<int,tinyint> comment 'number,sex')
comment 'description of the table'
tblproperties('creator'='me','time'='2016.1.1');

-- Create an external table: dropping it removes only metadata, not the files
create external table if not exists usr2(
name string,
pwd string,
address struct<street:string,city:string,state:string,zip:int>,
identify map<int,tinyint>)
row format delimited fields terminated by ','
location '/usr/local/hive/warehouse/hive.db/usr';

-- Create a partitioned table (partition columns are not data columns)
create table if not exists usr3(
name string,
pwd string,
address struct<street:string,city:string,state:string,zip:int>,
identify map<int,tinyint>)
partitioned by(city string,state string);

-- Copy the schema (not the data) of usr
create table if not exists hive.usr1 like hive.usr;

show tables in hive;
show tables 'u.*';                 -- list tables whose name starts with "u"
describe hive.usr;                 -- show column and table information for usr
alter table usr rename to custom;  -- rename the table

-- Add a partition to the table
alter table usr2 add if not exists
partition(city="beijing",state="China")
location '/usr/local/hive/warehouse/usr2/China/beijing';

-- Change the storage path of a partition
alter table usr2 partition(city="beijing",state="China")
set location '/usr/local/hive/warehouse/usr2/CH/beijing';

-- Drop a partition (terminating semicolon was missing in the original)
alter table usr2 drop if exists partition(city="beijing",state="China");

-- Rename column pwd to password, keep type string, move it after address
alter table usr change column pwd password string after address;

alter table usr add columns(hobby string);                  -- append a new column
alter table usr replace columns(name1 string,name2 string); -- replace the whole column list (original had "name1" twice)
alter table usr set tblproperties('creator'='liming');      -- change a table property
-- Change the storage format of one partition
alter table usr2 partition(city="beijing",state="China")
set fileformat sequencefile;
drop table if exists usr1;                                  -- drop the table

1.3 视图和索引的创建、修改和删除

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
-- Create a view ("table" is a reserved word in HiveQL, so the source must be a real table name)
create view view_name as select * from table_name;
-- Show the statement that defines the view
show create table view_name;
-- Change a view property
alter view view_name set tblproperties('comment'='this is a view');
-- Show the view's properties
desc extended view_name;
-- Drop the view
drop view if exists view_name;

-- Create an index on a column.
-- NOTE(review): Hive indexes were removed in Hive 3.0; prefer materialized
-- views or columnar formats (ORC/Parquet) with built-in indexing.
create index index_name on table table_name(column_name) as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' with deferred rebuild in table new_table_name;
-- Build the index data (a freshly created index is empty until rebuilt).
-- The rebuild targets the indexed base table, not the index storage table.
alter index index_name on table_name rebuild;
-- Rebuild the index, optionally for a single partition
alter index index_name on table_name [PARTITION partition_spec] rebuild;
-- Show the indexes defined on a table
show formatted index on table_name;
-- Drop the index
drop index if exists index_name on table_name;

1.4 用户自定义函数

1
2
3
4
5
6
7
8
9
10
-- List Hive's built-in functions
show functions;
-- Show how to use a specific function
describe function abs;

-- Create a UDF: first register the jar, then bind the function name
-- to its implementing class (the AS clause was missing in the original)
add jar <jar文件的绝对路径>;
create temporary function function_name as 'fully.qualified.ClassName';
-- Drop the function
drop temporary function if exists function_name;

二、数据操作(DML)

2.1 向表中装载数据

创建两个表 stu 和 course,terminated by '\t' 表示根据每行数据的'\t'制表符来区分字段的

1
2
-- stu holds students, course maps course ids to student ids;
-- fields in the data files are separated by a tab character.
create table if not exists stu(
    id   int,
    name string
) row format delimited fields terminated by '\t';

create table if not exists course(
    cid int,
    sid int
) row format delimited fields terminated by '\t';

2.1.1 从文件中导入

1
2
3
4
5
6
7
$ cat /tmp/stu.txt
1 aa
2 bb
3 cc

-- If stu.txt is stored on HDFS, omit the "local" keyword.
load data local inpath '/tmp/stu.txt' overwrite into table stu;

2.1.2 通过查询语句插入

1
2
3
4
5
-- Create a table and populate it from a query (CTAS)
create table stu1 as select id,name from stu;

-- Overwrite an existing table with the result of a query
insert overwrite table stu1 select id,name from stu where(条件);

2.2 从表中导出数据

通过 hadoop fs 拷贝

1
# Copy the table's files directly with the HDFS shell
hadoop  fs -cp source_path target_path;

查询写入本地文件

1
-- Write the query result into a local directory (the directory is overwritten)
insert overwrite local directory '/tmp/stu' select id,name from stu;

Hive Shell 命令导出

1
# Run a query non-interactively and redirect its output to a local file
hive -e 'select * from default.student;' > /tmp/student.txt;

Export 导出到 HDFS 上

1
-- Export the table (data plus metadata) to a directory on HDFS
export table default.student to '/user/hive/warehouse/export/student';

Sqoop 导出 ,从Hive导出到MySQL

1
2
3
# Export a Hive table's HDFS directory into MySQL via Sqoop.
# The trailing backslashes are required: without them the shell would
# treat each line as a separate command.
sqoop export \
--connect jdbc:mysql://localhost:3306/sqooptest --username root --password 123qwe --table sqoop_test \
--export-dir /user/hive/external/sqoop_test --input-fields-terminated-by ','

2.3 数据备份与恢复

备份

1
2
-- Back up src_commit to the local directory /backup/src_commit, using '|' as the field separator
insert overwrite local directory '/backup/src_commit' ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE select * from src_commit;

恢复

1
2
3
4
5
6
7
8
9
10
11
-- Re-create the table in the target database with the same field
-- separator and format the backup was written with
CREATE TABLE IF NOT EXISTS src_commit_01
(
id bigint comment '主键id',
product_name string comment '所属业务'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE;

-- Load the backup files into the new table
LOAD DATA LOCAL INPATH '/backup/src_commit' OVERWRITE INTO TABLE src_commit_01;

2.4 更新和删除数据

更新和删除的语法比较简单,和关系型数据库一致。需要注意的是这两个操作都只能在支持 ACID 的表,也就是事务表上才能执行。

更新
UPDATE table_name SET column = value [, column = value …] [WHERE expression]

删除
DELETE FROM table_name [WHERE expression]

2.4.1 修改 Hive 配置

修改 hive-site.xml,添加如下配置,开启事务支持,配置完成后需要重启 Hive 服务

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
<!-- ACID transaction support: enable concurrency control -->
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<!-- Transactional tables must be bucketed; enforce bucketing on write -->
<property>
<name>hive.enforce.bucketing</name>
<value>true</value>
</property>
<!-- Allow dynamic partitions without a static partition key -->
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<!-- Use the DB-backed transaction/lock manager required for ACID -->
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<!-- Run the compactor initiator on this metastore instance -->
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>
<!-- NOTE(review): hive.in.test is a test-only flag sometimes shown in
     tutorials as a workaround; confirm before enabling in production -->
<property>
<name>hive.in.test</name>
<value>true</value>
</property>

2.4.2 测试更新和删除

创建用于测试的事务表,建表时候指定属性 transactional = true 则代表该表是事务表。需要注意的是,按照官方文档 的说明,目前 Hive 中的事务表有以下限制:

  • 必须是 buckets Table;
  • 仅支持 ORC 文件格式;
  • 不支持 LOAD DATA …语句。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#Create the test table: a transactional table must be bucketed and stored as ORC
CREATE TABLE emp_ts(
id int,
name String
)
CLUSTERED BY (id) INTO 2 BUCKETS STORED AS ORC
TBLPROPERTIES ("transactional"="true");

#Insert test rows (executed as a MapReduce job)
INSERT INTO TABLE emp_ts VALUES (1,"a"),(2,"b");

#After the insert succeeds the data looks like this
select * from emp_ts;
+---------------+---------------+
| emp_ts.id | emp_ts.name |
+---------------+---------------+
| 2 | b |
| 1 | a |
+---------------+---------------+

#Update a row
UPDATE emp_ts SET name="c" WHERE id=1;

#Delete a row
DELETE FROM emp_ts WHERE id=2;
select * from emp_ts;
+---------------+---------------+
| emp_ts.id | emp_ts.name |
+---------------+---------------+
| 1 | c |
+---------------+---------------+

2.4.3 清空表数据

1
2
3
4
5
6
7
-- drop removes both the data and the table definition
drop table emp_ts;

-- truncate and delete clear the data but keep the table definition
truncate table emp_ts;
delete from emp_ts;
-- difference: delete runs a MapReduce job (and requires an ACID table),
-- truncate does not

2.5 Merge 合并

MERGE 从Hive 2.2开始可用。
只能在支持 ACID 的表上执行合并。有关详细信息,请参阅Hive 事务

语法如下

1
2
3
4
5
6
-- General form of the Hive MERGE statement (target must be an ACID table)
Standard Syntax:
MERGE INTO <target table> AS T USING <source expression/table> AS S
ON <boolean expression1>
WHEN MATCHED [AND <boolean expression2>] THEN UPDATE SET <set clause list>
WHEN MATCHED [AND <boolean expression3>] THEN DELETE
WHEN NOT MATCHED [AND <boolean expression4>] THEN INSERT VALUES<value list>

2.5.1 建表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
-- Database for the MERGE example
CREATE DATABASE merge_data;

-- Transaction-enabled target table, partitioned by transaction date
CREATE TABLE merge_data.transactions(
    ID int,
    TranValue string,
    last_update_user string)
PARTITIONED BY (tran_date string)
CLUSTERED BY (ID) into 5 buckets
STORED AS ORC TBLPROPERTIES ('transactional'='true');

-- Plain ORC source table holding the incoming change set
CREATE TABLE merge_data.merge_source(
    ID int,
    TranValue string,
    tran_date string)
STORED AS ORC;

2.5.2 导入数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
-- Incoming change rows (a NULL TranValue marks a row for removal downstream)
INSERT INTO merge_data.merge_source VALUES
    (1,  'value_01',    '20170410'),
    (4,  NULL,          '20170410'),
    (7,  'value_77777', '20170413'),
    (8,  NULL,          '20170413'),
    (8,  'value_08',    '20170415'),
    (11, 'value_11',    '20170415');

-- Initial contents of the target table (dynamic partition on tran_date)
INSERT INTO merge_data.transactions PARTITION (tran_date) VALUES
    (1,  'value_01', 'creation', '20170410'),
    (2,  'value_02', 'creation', '20170410'),
    (3,  'value_03', 'creation', '20170410'),
    (4,  'value_04', 'creation', '20170410'),
    (5,  'value_05', 'creation', '20170413'),
    (6,  'value_06', 'creation', '20170413'),
    (7,  'value_07', 'creation', '20170413'),
    (8,  'value_08', 'creation', '20170413'),
    (9,  'value_09', 'creation', '20170413'),
    (10, 'value_10', 'creation', '20170413');

2.5.3 Merge 操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
-- Setting required for the MERGE below to run
set hive.auto.convert.join=false;

-- Merge the source change set into transactions:
--   matched with a different, non-NULL value -> update
--   matched with a NULL value                -> delete
--   not matched                              -> insert
MERGE INTO merge_data.transactions AS T
USING merge_data.merge_source AS S
ON T.ID = S.ID and T.tran_date = S.tran_date
WHEN MATCHED AND (T.TranValue != S.TranValue AND S.TranValue IS NOT NULL) THEN UPDATE SET
TranValue = S.TranValue
,last_update_user = 'merge_update'
WHEN MATCHED AND S.TranValue IS NULL THEN DELETE
WHEN NOT MATCHED THEN INSERT VALUES (
S.ID
, S.TranValue
, 'merge_insert'
, S.tran_date
);

#查看Merge 后的数据,使用order by会走map、reduce
select * from transactions order by id;
+------------------+-------------------------+--------------------------------+-------------------------+
| transactions.id | transactions.tranvalue | transactions.last_update_user | transactions.tran_date |
+------------------+-------------------------+--------------------------------+-------------------------+
| 1 | value_01 | creation | 20170410 |
| 2 | value_02 | creation | 20170410 |
| 3 | value_03 | creation | 20170410 |
| 5 | value_05 | creation | 20170413 |
| 6 | value_06 | creation | 20170413 |
| 7 | value_77777 | merge_update | 20170413 |
| 8 | value_08 | merge_insert | 20170415 |
| 9 | value_09 | creation | 20170413 |
| 10 | value_10 | creation | 20170413 |
| 11 | value_11 | merge_insert | 20170415 |
+------------------+-------------------------+--------------------------------+-------------------------+

三、数据查询(DQL)

Hive 查询语句是 SELECT 语句,语法如下

1
2
3
4
5
6
7
-- General form of a Hive SELECT statement
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list]]
[LIMIT [offset,] rows]

3.1 Where 条件

类似我们传统 SQL 的 WHERE 条件,目前支持 AND、OR;从 0.9 版本开始支持 BETWEEN、IN、NOT IN;不支持 EXISTS、NOT EXISTS

ORDER BY 与 SORT BY 的不同:ORDER BY 全局排序,只有一个 Reduce 任务;SORT BY 只在本机做排序

1
-- keep only rows whose deptid equals 31
select * from emp where deptid=31;

3.2 基于分区查询

当指定表有分区定义(即 PARTITIONED BY 子句),查询可以在指定的分区范围内进行,否则会扫描整个表;此外,要想让分区内查询,必须在 WHERE 或 JOIN … ON … 子句上明确指定分区字段信息。

假设表的分区信息是日期字段 dt,那分区查询如下

1
2
-- restrict the scan to the partitions in the given dt range
SELECT * FROM t1
WHERE dt >= '2020-09-15' AND dt <= '2020-10-08'

如果表 t1 和 t2 通过 JOIN 关联时,若要分区起作用,需要在 ON 上指定分区信息

1
2
3
-- partition pruning in a JOIN: the dt predicates belong in the ON clause
SELECT t1.*
FROM t1 JOIN t2
ON (t1.user_id = t2.id AND t1.dt >= '2020-09-15' AND t1.dt <= '2020-10-08')

3.3 GROUP BY 分组统计

Hive 提供 GROUP BY 子句做聚合操作

1
-- one output row per distinct col1 value
SELECT col1 FROM t1 GROUP BY col1

3.4 Having 过滤分组统计结果

聚合语句 GROUP BY 常和 HAVING 子句结合使用,用于指定聚合条件。
Hive 在 0.7.0 版本开始支持 HAVING,之前老版本需要通过其它 SELECT 子句才能做到

1
-- keep only groups whose col2 total exceeds 10
SELECT col1 FROM t1 GROUP BY col1 HAVING SUM(col2) > 10

等同于

1
-- pre-0.7.0 equivalent: filter the aggregate in an outer query instead of HAVING
SELECT col1 FROM (SELECT col1, SUM(col2) AS col2sum FROM t1 GROUP BY col1) t2 WHERE t2.col2sum > 10

3.5 Limit 限制返回记录数量

Limit 可以限制查询的记录数,例如,返回前 5 条记录

1
2
3
4
-- return the first 5 records
SELECT * FROM t1 LIMIT 5;

-- return records 3 through 7: skip 2 rows, then take 5 (offset form needs Hive 2.0+)
SELECT * FROM t1 LIMIT 2, 5;

实现Top k 查询

下面的查询语句查询销售记录最大的 5 个销售代表

1
2
-- force a single reducer so SORT BY yields one globally sorted stream,
-- then keep the 5 rows with the largest amount
-- (the terminating semicolon after SET was missing in the original)
SET mapred.reduce.tasks = 1;
SELECT * FROM test SORT BY amount DESC LIMIT 5;

正则表达式规范

SELECT 语句可以使用正则表达式做列选择,前提是需要设置set hive.support.quoted.identifiers=none;,下面的语句查询除了 ds 和 hr 之外的所有列

1
-- select every column except ds and hr (requires hive.support.quoted.identifiers=none)
SELECT `(ds|hr)?+.+` FROM test

3.6 统计函数

与SQL一样,可以使用count( )函数统计记录条数,对数值型字段,可以用max( )函数求最大值,用min( )函数求最小值,用sum( )函数求和,用avg( )函数求平均值。使用这些函数,都会触发MapReduce操作。

1
-- aggregate functions over the whole table; triggers a MapReduce job
SELECT count(*),max(degree),min(degree),avg(degree),sum(degree) FROM emp;

3.7 distinct 去除重复值

1
-- list each department id once
select distinct(deptid) from emp;

3.8 like 模糊查询

like如同SQL语句,可以对字符型字段进行模糊查询

1
-- names containing the letter "o" anywhere
select * from emp where ename like '%o%';

3.9 having 过滤分组统计结果

1
-- per-department average degree, keeping departments whose average exceeds 3
select deptid,avg(degree) from emp group by deptid having avg(degree)>3;

3.10 inner join 内联接

1
2
3
-- inner join: only rows with a matching deptid on both sides
select e.*,d.* from emp e inner join dept d on e.deptid=d.deptid;
-- the inner keyword can be omitted, e.g.:
select e.*,d.* from emp e join dept d on e.deptid=d.deptid;

3.11 left outer join和right outer join外联接

left outer join表示左外联接,right outer join表示右外联接

1
2
3
4
5
-- left outer join keeps every row of the left table; unmatched
-- right-table columns come back as NULL
select e.*,d.* from emp e left outer join dept d on e.deptid=d.deptid;

-- right outer join keeps every row of the right table; unmatched
-- left-table columns come back as NULL
select e.*,d.* from emp e right outer join dept d on e.deptid=d.deptid;

3.12 full outer join 外部联接

full outer join表示完全外部联接

1
2
select e.*,d.* from emp e full outer join dept d on e.deptid=d.deptid;
#以上命令执行后会显示full outer join左右两表的全部数据。如果左表或右表中没有对应数据,则显示为NULL

3.13 order by 排序

order by后面指明排序字段,默认是按asc升序排序,也可用desc指示降序排序。

1
2
3
select * from emp order by deptid;
#或使用如下命令
select * from emp order by deptid desc;

更多操作请参考:Hive语言手册

-------------本文结束感谢您的阅读-------------