[root@hadoop102 ~]# vim /etc/profile.d/my_env.sh
#MYSQL_HOME
export MYSQL_HOME=/opt/module/mysql-5.7.20
export PATH=$PATH:$MYSQL_HOME/bin
[root@hadoop102 ~]# source /etc/profile
Install dependency packages
[root@hadoop102 ~]# yum install libaio
Initialize MySQL
Initialize and create the MySQL database
[root@hadoop102 ~]# mysqld --initialize --user=mysql --basedir=/opt/module/mysql-5.7.20/ --datadir=/data/mysql/
2022-05-11T10:50:24.777743Z 0 [Warning] TIMESTAMP with implicit DEFAULT value is deprecated. Please use --explicit_defaults_for_timestamp server option (see documentation for more details).
2022-05-11T10:50:25.027412Z 0 [Warning] InnoDB: New log files created, LSN=45790
2022-05-11T10:50:25.068499Z 0 [Warning] InnoDB: Creating foreign key constraint system tables.
2022-05-11T10:50:25.139430Z 0 [Warning] No existing UUID has been found, so we assume that this is the first time that this server has been started. Generating a new UUID: 29be3489-d118-11ec-ba45-000c2900c11d.
2022-05-11T10:50:25.140670Z 0 [Warning] Gtid table is not ready to be used. Table 'mysql.gtid_executed' cannot be opened.
2022-05-11T10:50:25.141518Z 1 [Note] A temporary password is generated for root@localhost: L2)cw18vE8lw
Configure the my.cnf file
# Locate the configuration file
[root@hadoop102 ~]# mysql --help | grep cnf
                      order of preference, my.cnf, $MYSQL_TCP_PORT,
/etc/my.cnf /etc/mysql/my.cnf /usr/local/mysql/etc/my.cnf /opt/module/mysql-5.7.20/my.cnf ~/.my.cnf

# Edit the configuration file
[root@hadoop102 ~]# vim /etc/my.cnf
[mysqld]
user=mysql
port=3306
basedir=/opt/module/mysql-5.7.20
datadir=/data/mysql
server_id=1
socket=/tmp/mysql.sock
[mysql]
socket=/tmp/mysql.sock
Configure the MySQL service
[root@hadoop102 ~]# vim /usr/lib/systemd/system/mysqld.service
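The contents of the unit file are not reproduced above; a minimal sketch of what a typical mysqld.service for this layout might contain (paths follow the steps above, the ExecStart options and limits are assumptions, not the author's exact file):

[Unit]
Description=MySQL Server
After=network.target

[Service]
User=mysql
Group=mysql
# Assumed: start mysqld with the /etc/my.cnf edited above
ExecStart=/opt/module/mysql-5.7.20/bin/mysqld --defaults-file=/etc/my.cnf
LimitNOFILE=5000
Restart=on-failure

[Install]
WantedBy=multi-user.target

After saving the unit file, run systemctl daemon-reload and systemctl start mysqld, then log in with the temporary password printed during initialization and change it with ALTER USER.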
<!-- Bind host on which to run the HiveServer2 Thrift interface -->
<property>
    <name>hive.server2.thrift.bind.host</name>
    <value>hadoop102</value>
</property>
[eitan@hadoop102 ~]$ sudo vim /opt/module/apache-hive-3.1.3/conf/hive-site.xml
# Change <value> to hadoop103
<!-- Bind host on which to run the HiveServer2 Thrift interface -->
<property>
    <name>hive.server2.thrift.bind.host</name>
    <value>hadoop103</value>
</property>
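After restarting HiveServer2 so the new bind host takes effect, the JDBC URL changes accordingly; a hedged example (the default port 10000 and the eitan user are assumptions taken from the prompt above):

# Assumed defaults: HiveServer2 listening on port 10000, connecting as user eitan
[eitan@hadoop102 ~]$ beeline -u jdbc:hive2://hadoop103:10000 -n eitan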
-- Create the source table and place the corresponding data file in its directory
CREATE TABLE t_student (
    id   int    COMMENT "ID",
    name string COMMENT "student name",
    sex  string COMMENT "gender",
    age  int    COMMENT "age",
    area string COMMENT "area"
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
-- Create the dynamically partitioned table
CREATE TABLE t_dynamic_student_partition (
    id   int    COMMENT "ID",
    name string COMMENT "student name",
    sex  string COMMENT "gender"
) PARTITIONED BY (area string COMMENT "area", age int COMMENT "age")
ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
-- Insert data with INSERT ... SELECT; Hive starts a MapReduce job that scans the rows
-- and distributes them into different subdirectories according to the defined partitions
INSERT INTO TABLE t_dynamic_student_partition
SELECT id, name, sex, area, age FROM t_student;
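Note that a dynamic partition insert with no static partition value requires nonstrict mode. A minimal sketch of the session settings commonly applied before the INSERT above (the property names are standard Hive settings; the limit values are illustrative):

-- Enable dynamic partitioning and allow all partition columns to be dynamic
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Optionally raise the partition limits if many (area, age) combinations exist (illustrative values)
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.exec.max.dynamic.partitions=2000;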
-- step 1: enable bucketing (no longer needs to be set since Hive 2.0)
set hive.enforce.bucketing=true;
-- step 2: create the bucketed table
CREATE TABLE t_usa_covid19_bucket_sort (
    count_date string COMMENT "reporting date",
    county     string COMMENT "county",
    state      string COMMENT "state",
    fips       int    COMMENT "county FIPS code",
    cases      int    COMMENT "cumulative confirmed cases",
    deaths     int    COMMENT "cumulative deaths"
)
-- bucket by state, and sort by cases in descending order within each bucket
CLUSTERED BY (state) SORTED BY (cases DESC) INTO 5 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

-- step 3: create a plain (non-bucketed) table and upload the data into it
CREATE TABLE t_usa_covid19 (
    count_date string COMMENT "reporting date",
    county     string COMMENT "county",
    state      string COMMENT "state",
    fips       int    COMMENT "county FIPS code",
    cases      int    COMMENT "cumulative confirmed cases",
    deaths     int    COMMENT "cumulative deaths"
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
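The remaining steps are not shown above; a hedged sketch of loading the source data and filling the bucketed table (the local file path is hypothetical):

-- step 4 (assumed): load the CSV into the plain staging table; the local path is hypothetical
LOAD DATA LOCAL INPATH '/opt/data/us-covid19-counties.dat' INTO TABLE t_usa_covid19;

-- step 5 (assumed): insert into the bucketed table; the CLUSTERED BY / SORTED BY definition
-- makes Hive hash rows by state into 5 bucket files, sorted by cases within each bucket
INSERT INTO t_usa_covid19_bucket_sort SELECT * FROM t_usa_covid19;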
-- 1. Enable concurrency support in Hive
set hive.support.concurrency = true;
-- 2. Set the transaction manager
set hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
-- 3. Create a transactional table
CREATE TABLE trans_student (
    id   int,
    name string,
    area string
) CLUSTERED BY (id) INTO 2 BUCKETS
STORED AS ORC
TBLPROPERTIES ("transactional" = "true");
-- 4. Insert data into the transactional table
INSERT INTO trans_student SELECT id, name, area FROM t_student;
-- 5. Query the aggregate result without a materialized view
SELECT area, count(*) FROM trans_student GROUP BY area;
-- 6. Create a materialized view
CREATE MATERIALIZED VIEW m_student_agg AS
SELECT area, count(*) FROM trans_student GROUP BY area;
-- 7. Run the query again; it can now be rewritten to read from the materialized view
SELECT area, count(*) FROM trans_student GROUP BY area;
-- 8. Disable query rewriting for the materialized view
ALTER MATERIALIZED VIEW m_student_agg DISABLE REWRITE;
-- 9. Check the execution plan: the query now reads from the original transactional table again
--    instead of the materialized view
EXPLAIN SELECT area, count(*) FROM trans_student GROUP BY area;
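When new rows are later inserted into trans_student, the materialized view does not refresh itself; a hedged sketch of keeping it usable (REBUILD is standard Hive syntax, the rewriting flag is shown as a session setting by assumption):

-- Make sure automatic query rewriting is enabled for the session (default is true in Hive 3)
set hive.materializedview.rewriting = true;
-- Refresh the materialized view after new data has been inserted into trans_student
ALTER MATERIALIZED VIEW m_student_agg REBUILD;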
-- 1. Rename a table
ALTER TABLE table_name RENAME TO new_table_name;
-- 2. Change table properties
ALTER TABLE table_name SET TBLPROPERTIES (property_name = property_value, ... );
-- 3. Change the table comment
ALTER TABLE student SET TBLPROPERTIES ('comment' = "new comment for student table");
-- 4. Change SerDe properties
ALTER TABLE table_name SET SERDE serde_class_name [WITH SERDEPROPERTIES (property_name = property_value, ... )];
ALTER TABLE table_name [PARTITION partition_spec] SET SERDEPROPERTIES serde_properties;
ALTER TABLE table_name SET SERDEPROPERTIES ('field.delim' = ',');
-- 5. Remove SerDe properties
ALTER TABLE table_name [PARTITION partition_spec] UNSET SERDEPROPERTIES (property_name, ... );
-- 6. Change the table's file storage format
--    This only changes the table metadata; any conversion of existing data must be done outside of Hive.
ALTER TABLE table_name SET FILEFORMAT file_format;
-- 7. Change the table's storage location
ALTER TABLE table_name SET LOCATION "new location";
-- 8. Change a column's name / type / position / comment
CREATE TABLE test_change (a int, b int, c int);
-- First change column a's name to a1.
ALTER TABLE test_change CHANGE a a1 INT;
-- Next change column a1's name to a2, its data type to string, and put it after column b.
ALTER TABLE test_change CHANGE a1 a2 STRING AFTER b;
-- The new table's structure is: b int, a2 string, c int.
-- Then change column c's name to c1, and put it as the first column.
ALTER TABLE test_change CHANGE c c1 INT FIRST;
-- The new table's structure is: c1 int, b int, a2 string.
-- Add a comment to column a1
ALTER TABLE test_change CHANGE a1 a1 INT COMMENT 'this is column a1';
-- 1. List all databases; SCHEMAS and DATABASES are synonyms
SHOW DATABASES|SCHEMAS;
-- 2. List all tables/views/materialized views/partitions/indexes in the current database
SHOW TABLES [IN database_name];   -- optionally restrict to a specific database
-- 3. List all views in the current database
SHOW VIEWS;
-- show all views that start with "test_"
SHOW VIEWS 'test_*';
-- show views from database database_name
SHOW VIEWS [IN/FROM database_name];
-- 4. List all materialized views in the current database
SHOW MATERIALIZED VIEWS [IN/FROM database_name];
-- 5. Show a table's partitions, listed in alphabetical order; running this on a non-partitioned table raises an error
SHOW PARTITIONS table_name;
-- 6. Show extended information about tables/partitions
SHOW TABLE EXTENDED [IN|FROM database_name] LIKE table_name;
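As a quick usage sketch against the tables created earlier in this article (table names are taken from the sections above):

-- List the (area, age) partitions created by the dynamic partition insert
SHOW PARTITIONS t_dynamic_student_partition;
-- Show extended information (location, file count, size, ...) for the bucketed table
SHOW TABLE EXTENDED LIKE 't_usa_covid19_bucket_sort';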
hive.compactor.delta.num.threshold
Specifies the number of delta directories in a partition that triggers an automatic minor compaction. The default value is 10.
hive.compactor.delta.pct.threshold
Specifies the percentage size of delta files relative to the corresponding base files that triggers an automatic major compaction. The default value is 0.1, which is 10 percent.
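These thresholds are normally configured in hive-site.xml on the metastore that runs the compactor; a hedged sketch with illustrative values (not recommendations):

<!-- Illustrative values only: trigger minor compaction after 5 deltas, major compaction at 20% of base size -->
<property>
    <name>hive.compactor.delta.num.threshold</name>
    <value>5</value>
</property>
<property>
    <name>hive.compactor.delta.pct.threshold</name>
    <value>0.2</value>
</property>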
DML – UPDATE and DELETE (updating and deleting data)
See Hive Transactional Tables – Hive transactional tables in practice
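For reference, a minimal sketch against the trans_student transactional table defined earlier (the predicate and value are made up for illustration):

-- UPDATE and DELETE only work on transactional (ACID) tables such as trans_student
UPDATE trans_student SET area = 'beijing' WHERE id = 1;
DELETE FROM trans_student WHERE id = 2;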
Hive Data Query Language (DQL)
Syntax
[WITH CommonTableExpression (, CommonTableExpression)*]
SELECT [ALL|DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list]]
[LIMIT [offset,] rows]
Execution order within a query: FROM > WHERE > GROUP BY (including aggregation) > HAVING > ORDER BY > SELECT.
Aggregate functions (sum, min, max, avg, count) are evaluated before the HAVING clause.
The WHERE clause is evaluated before the aggregate functions (sum, min, max, avg, count).
select state, sum(deaths) as cnts
from t_usa_covid19_p
where count_date = "2021-01-28"
group by state
having cnts > 10000;
ORDER BY
-- order by: sort the result by a column
select * from t_usa_covid19_p
where count_date = "2021-01-28" and state = "California"
order by deaths;        -- default is asc, nulls first
select * from t_usa_covid19_p
where count_date = "2021-01-28" and state = "California"
order by deaths desc;   -- explicit desc, nulls last
-- It is strongly recommended to use LIMIT together with ORDER BY, to avoid sorting an overly large result set.
-- When hive.mapred.mode is set to strict, an ORDER BY without LIMIT raises an exception.
select * from t_usa_covid19_p
where count_date = "2021-01-28" and state = "California"
order by deaths desc
limit 3;
CLUSTER BY
-- cluster by
select * from student;                 -- number of reduce tasks not specified
-- log output: Number of reduce tasks not specified. Estimated from input data size: 1
select * from student cluster by sno;
-- manually set the number of reduce tasks
set mapreduce.job.reduces = 2;
select * from student cluster by sno;
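CLUSTER BY always distributes and sorts on the same column in ascending order; a hedged sketch of the equivalent DISTRIBUTE BY / SORT BY form, which also allows a different sort order (column name follows the student table above):

-- cluster by sno is equivalent to distributing and sorting on the same column:
select * from student distribute by sno sort by sno;
-- distribute by / sort by also allows a different order, which cluster by cannot express:
select * from student distribute by sno sort by sno desc;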
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)
-- Since only b's key1 column is involved in the joins, this is converted into a single MR job;
-- the values for a particular key from tables a and b are buffered in the reducer's memory,
-- and for every row retrieved from c, the join is computed against the buffered rows.

SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
-- This requires two MR jobs: the first joins a with b, buffering a's values while streaming b's values through the reducers.
SELECT /*+ STREAMTABLE(a) */ a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)
-- All three tables a, b, c are joined in a single MR job; the values for a particular key from
-- tables b and c are buffered in the reducer's memory, and for every row retrieved from a,
-- the join is computed against the buffered rows. If the STREAMTABLE hint is omitted,
-- Hive streams the rightmost table.
If all but one of the tables being joined are small, the join can be executed as a map-only job
SELECT /*+ MAPJOIN(b) */ a.key, a.value FROM a JOIN b ON a.key = b.key
-- No reducer is needed. For each mapper of a, table b is read in its entirety.
-- The restriction is that a FULL/RIGHT OUTER JOIN b cannot be performed this way.
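Instead of the hint, newer Hive versions can convert such joins to map joins automatically; a hedged sketch of the relevant session settings (the size threshold shown is the usual default and is illustrative):

-- Let Hive automatically convert joins with a small table into map joins
set hive.auto.convert.join = true;
-- Tables smaller than this size (bytes) are treated as the small side (illustrative value)
set hive.mapjoin.smalltable.filesize = 25000000;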