现有如此三份数据: 1、users.dat 数据格式为: 2::M::56::16::70072 对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String 对应字段中文解释:用户id,性别,年龄,职业,邮政编码
2、movies.dat 数据格式为: 2::Jumanji (1995)::Adventure|Children’s|Fantasy
对应字段为:MovieID BigInt, Title String, Genres String 对应字段中文解释:电影ID,电影名字,电影类型
3、ratings.dat 数据格式为: 1::1193::5::978300760 对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String 对应字段中文解释:用户ID,电影ID,评分,评分时间戳
题目要求:
(1)写shell脚本清洗数据。(hive不支持解析多字节的分隔符,也就是说hive只能解析’:’, 不支持解析’::’,所以用普通方式建表来使用是行不通的,要求对数据做一次简单清洗) (2)使用Hive能解析的方式进行
首先创建一个影评数据库 create database if not exists yingping; 进入数据库 use yingping; 创建表
create external table if not exists users(UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String) row format serde ‘org.apache.hadoop.hive.serde2.RegexSerDe’ with serdeproperties(‘input.regex’=’(.):?.):?.):?.):?.*)’,‘output.format.string’=’%1 s s %2 ss %3 s s %4 ss %5$s’) stored as textfile location “/user/data/yingping/users”; 在hdfs中直接将数据放入到"/user/data/yingping/users"下就可以直接访问数据了,也可以用 load data locla inpath “hdts中的全路径名” into table user表名 验证是否创建成功 测试时最好set hive.exec.mode.local.auto=true;设置本地模式,这样可以提高执行效率,注意这仅仅是在练习是用
select * from users limit 5;
注意:在hive 中禁用 select * from tnname; 一定配合 limit使用,因为一张
create external table if not exists movies(MovieID BigInt, Title String, Genres String) row format serde ‘org.apache.hadoop.hive.serde2.RegexSerDe’ with serdeproperties(‘input.regex’=’(.):?.):?.*)’,‘output.format.string’=’%1 s s %2 ss %3$s’) stored as textfile location “/user/data/yingping/movies”;
select * from movies limit 10;
create external table if not exists ratings(UserID BigInt, MovieID BigInt, Rating Double, Timestamped String) row format serde ‘org.apache.hadoop.hive.serde2.RegexSerDe’ with serdeproperties(‘input.regex’=’(.):?.):?.):?.)’,‘output.format.string’=’%1 s s %2 ss %3 s s %4 ss’) stored as textfile location “/user/data/yingping/ratings”;
select * from ratings limit 10;
set hive.exec.mode.local.auto=true;本地模式
select count(*) rcount,title from ratings a join movies b on a.movieid=b.movieid group by a.movieid ,b.title order by rcount desc limit 10;
select c.Gender,b.title,avg(a.rating) avga from ratings a join movies b on a.movieid=b.movieid join users c on a.userid=c.userid where c.Gender=“M” group by b.movieid,b.title,c.Gender order by avga desc limit 10;
select c.Gender,b.title,avg(a.rating) avga from ratings a join movies b on a.movieid=b.movieid join users c on a.userid=c.userid where c.Gender=“F” group by b.movieid,b.title,c.Gender order by avga desc limit 10;
select b.Age,avg(a.rating) avg from ratings a join users b on a.userid=b.userid group by b.Age;
select g.userid,h.title,g.rating from (select e.userid,e.movieid,e.rating from (select d.userid userid,avg(d.rating) rating,d.movieid movieid from (select count(*) count,b.userid userid from ratings a join users b on a.userid=b.userid where b.Gender=“F” group by b.userid order by count desc limit 1) c left join ratings d on c.userid=d.userid group by d.userid,d.movieid) e join (select count(distinct a.userid) ypcount,b.movieid movieid from users a join ratings b on a.userid=b.userid where a.Gender=“F” group by b.movieid having ypcount>50 order by ypcount desc) f on e.movieid=f.movieid order by e.rating desc limit 10)g join movies h on g.movieid=h.movieid;
select avg(a.rating) rating,substr(b.Title,-5,4) year,b.movieid movieid from ratings a join movies b on a.movieid=b.movieid group by substr(b.Title,-5,4),b.movieid having rating >4 order by rating desc limit 10;
select avg(b.rating) rating,a.title from(select movieid,title,leixing.lx Genres from movies lateral view explode(split(Genres,"\|"))leixing as lx) a join ratings b on a.movieid=b.movieid where lower(a.Genres)=“comedy” and substr(a.Title,-5,4)=“1997” group by a.movieid,a.Title order by rating desc limit 10;
select * from (select c.title title,c.Genres,c.rating,row_number() over(distribute by c.Genres sort by c.rating desc) index from (select a.title title,avg(b.rating) rating,a.movieid,a.Genres from (select movieid,title,leixing.lx Genres from movies lateral view explode(split(Genres,"\|"))leixing as lx) a join ratings b on a.movieid=b.movieid group by a.Genres,a.movieid,a.title) c) d where d.index<=5 ;