hdfs集群三個節(jié)點內存 : 8G ,4G ,4G
Hive 配置spark 引擎提交查詢
最新更新:使用 SparkSQL 也能執行成功。SparkSQL 和 Hive on Spark 都是使用 Spark 計算引擎,但是兩者在 SQL 語法解析上不相同。
目標: 請寫出代碼求得所有用戶和活躍用戶的總數(shù)及平均年齡。(活躍用戶指連續(xù)兩天都有訪問記錄的用戶)日期 用戶 年齡
解法1:
-- 解法1: single query producing four totals in ONE row:
--   allCount   = total number of distinct users
--   allAvg     = average age over all users
--   twiceCount = number of active users (visited on >= 2 consecutive days)
--   twiceAvg   = average age over active users
-- Each UNION ALL branch fills its own two columns and zeroes the others,
-- so the outer SUMs simply pick up the non-zero branch.
select
    sum(allCount)   as allCount,
    sum(allAvg)     as allAvg,
    sum(twiceCount) as twiceCount,
    sum(twiceAvg)   as twiceAvg
from
(
    -- branch 1: all users (dedupe repeated visit rows first)
    select
        count(*)              as allCount,
        sum(t.age) / count(*) as allAvg,
        0                     as twiceCount,
        0                     as twiceAvg
    from
        (select user_id, age
         from user_age
         group by user_id, age) t

    union all

    -- branch 2: active users
    select
        0                      as allCount,
        0                      as allAvg,
        count(*)               as twiceCount,
        sum(t4.age) / count(*) as twiceAvg
    from
    (
        -- one row per active user (a user with several streaks
        -- would otherwise appear once per streak)
        select
            t3.user_id,
            min(t3.age) as age
        from
        (
            select
                t2.user_id,
                t2.age
            from
            (
                -- classic "date minus row_number" trick: rows belonging to
                -- one run of consecutive days share the same diff value
                select
                    t1.user_id,
                    t1.age,
                    date_sub(t1.dt, t1.rn) as diff
                from
                (
                    select
                        t.user_id,
                        t.age,
                        t.dt,
                        row_number() over (partition by t.user_id order by t.dt) rn
                    from
                        -- dedupe same-day duplicate visits so a day counts once
                        (select user_id, age, dt
                         from user_age
                         group by user_id, age, dt) t
                ) t1
            ) t2
            group by t2.user_id, t2.age, t2.diff
            having count(t2.user_id) >= 2   -- streak of at least two days
        ) t3
        group by t3.user_id
    ) t4
    -- FIX: the original ended with "group by t4.user_id" here, which emitted
    -- one row per active user (twiceAvg = that user's age), so the outer
    -- sum(twiceAvg) returned the SUM of ages instead of the average.
    -- Dropping the group by makes this branch a single aggregate row.
) t5
解法2:
-- 解法2: same four totals as 解法1, but the "streak length" is computed with
-- a windowed COUNT(...) OVER (PARTITION BY user_id, diff) instead of a
-- GROUP BY ... HAVING.
-- NOTE(review): as recorded below, under hive.execution.engine=spark this
-- variant dies with a NullPointerException in VectorUDFDateAddColCol
-- (vectorized PTF over date_sub). Workarounds seen in these notes:
--   set hive.vectorized.execution.enabled=false;  -- or run on MR (解法1 also works)
select
    sum(allCount)   as allCount,
    sum(allAvg)     as allAvg,
    sum(twiceCount) as twiceCount,
    sum(twiceAvg)   as twiceAvg
from
(
    -- branch 1: all users (dedupe repeated visit rows first)
    select
        count(*)              as allCount,
        sum(t.age) / count(*) as allAvg,
        0                     as twiceCount,
        0                     as twiceAvg
    from
        (select user_id, age
         from user_age
         group by user_id, age) t

    union all

    -- branch 2: active users
    select
        0                      as allCount,
        0                      as allAvg,
        count(*)               as twiceCount,
        sum(t4.age) / count(*) as twiceAvg
    from
    (
        -- one row per active user
        select
            t3.user_id,
            min(t3.age) as age
        from
        (
            select
                t2.user_id,
                t2.age,
                -- rows sharing (user_id, diff) form one consecutive-day
                -- streak; the windowed count is that streak's length
                count(t2.user_id) over (partition by t2.user_id, t2.diff) days
            from
            (
                select
                    t1.user_id,
                    t1.age,
                    date_sub(t1.dt, t1.rn) as diff
                from
                (
                    select
                        t.user_id,
                        t.age,
                        t.dt,
                        row_number() over (partition by t.user_id order by t.dt) rn
                    from
                        -- dedupe same-day duplicate visits
                        (select user_id, age, dt
                         from user_age
                         group by user_id, age, dt) t
                ) t1
            ) t2
        ) t3
        where t3.days >= 2           -- keep only rows inside a streak
        group by t3.user_id
    ) t4
    -- FIX: the original ended with "group by t4.user_id", which emitted one
    -- row per active user and turned sum(twiceAvg) into the SUM of ages
    -- rather than the average. Removed so this branch is one aggregate row.
) t5 ;
解法2 使用spark 引擎報錯。
源數(shù)據(jù)
-- 源數據: one row per visit; loaded from the comma-separated sample below.
-- IF NOT EXISTS makes the DDL safe to re-run when replaying these notes.
create table if not exists user_age(
    dt      string,   -- visit date as a 'yyyy-MM-dd' string, e.g. 2019-02-11
    user_id string,   -- visitor id
    age     int       -- visitor age
)
row format delimited fields terminated by ',';
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
- 使用開窗函數(shù)提示異常
-- Minimal repro of the Spark-engine failure: a windowed COUNT over the
-- derived column diff = date_sub(dt, row_number).  Under
-- hive.execution.engine=spark this throws the NPE shown below
-- (VectorUDFDateAddColCol, i.e. the vectorized PTF path); the GROUP BY
-- rewrite and the MR engine (later sections) both succeed.
-- Keep this query byte-for-byte as-is — it exists to reproduce the bug.
select
t2.user_id
,t2.age
-- streak length: rows with the same (user_id, diff) are consecutive days
,COUNT(t2.user_id) over(PARTITION BY t2.user_id,t2.diff) as days
from
( SELECT
t1.user_id ,
t1.age ,
-- "date minus row_number" trick: constant within a consecutive-day run
date_sub(t1.dt , t1.rn) as diff
from
(select
t.user_id ,
t.age ,
t.dt ,
ROW_NUMBER () over(PARTITION by t.user_id order by t.dt ) rn
from
-- dedupe same-day duplicate visits before numbering
(select
user_id ,
age ,
dt
from user_age group by user_id , age ,dt ) t) t1 ) t2
DBeaver異常顯示
org.jkiss.dbeaver.model.sql.DBSQLException: SQL 錯誤 [3] [42000]: Error while processing statement: FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.spark.SparkTask. Spark job failed during runtime. Please check stacktrace for the root cause.
at org.jkiss.dbeaver.model.impl.jdbc.exec.JDBCStatementImpl.executeStatement(JDBCStatementImpl.java:134)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.executeStatement(SQLQueryJob.java:487)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.lambda$0(SQLQueryJob.java:424)
at org.jkiss.dbeaver.model.exec.DBExecUtils.tryExecuteRecover(DBExecUtils.java:164)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.executeSingleQuery(SQLQueryJob.java:416)
at org.jkiss.dbeaver.ui.editors.sql.execute.SQLQueryJob.extractData(SQLQueryJob.java:774)
at org.jkiss.dbeaver.ui.editors.sql.SQLEditor$QueryResultsContainer.readData(SQLEditor.java:2914)
at org.jkiss.dbeaver.ui.controls.resultset.ResultSetJobDataRead.lambda$0(ResultSetJobDataRead.java:110)
at org.jkiss.dbeaver.model.exec.DBExecUtils.tryExecuteRecover(DBExecUtils.java:164)
at org.jkiss.dbeaver.ui.controls.resultset.ResultSetJobDataRead.run(ResultSetJobDataRead.java:108)
at org.jkiss.dbeaver.ui.controls.resultset.ResultSetViewer$17.run(ResultSetViewer.java:3421)
at org.jkiss.dbeaver.model.runtime.AbstractJob.run(AbstractJob.java:103)
at org.eclipse.core.internal.jobs.Worker.run(Worker.java:63)
Caused by: java.sql.SQLException: Error while processing statement: FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.spark.SparkTask. Spark job failed during runtime. Please check stacktrace for the root cause.
at org.apache.hive.jdbc.HiveStatement.waitForOperationToComplete(HiveStatement.java:354)
at org.apache.hive.jdbc.HiveStatement.execute(HiveStatement.java:245)
at org.jkiss.dbeaver.model.impl.jdbc.exec.JDBCStatementImpl.execute(JDBCStatementImpl.java:338)
at org.jkiss.dbeaver.model.impl.jdbc.exec.JDBCStatementImpl.executeStatement(JDBCStatementImpl.java:131)
... 12 more
yarn監(jiān)控:任務執(zhí)行結果

image.png
Failure Reason
Job aborted due to stage failure: Task 0 in stage 7.0 failed 4 times, most recent failure: Lost task 0.3 in stage 7.0 (TID 18, hadoop102, executor 2): java.lang.RuntimeException: java.lang.NullPointerException
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectorRow(SparkReduceRecordHandler.java:543)
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processRow(SparkReduceRecordHandler.java:300)
at org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:56)
at org.apache.hadoop.hive.ql.exec.spark.HiveReduceFunctionResultList.processNextRecord(HiveReduceFunctionResultList.java:28)
at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85)
at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127)
at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127)
at org.apache.spark.SparkContext$$anonfun$37.apply(SparkContext.scala:2212)
at org.apache.spark.SparkContext$$anonfun$37.apply(SparkContext.scala:2212)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDateAddColCol.evaluate(VectorUDFDateAddColCol.java:96)
at org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFOperator.process(VectorPTFOperator.java:366)
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939)
at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:158)
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.forwardBatch(SparkReduceRecordHandler.java:549)
at org.apache.hadoop.hive.ql.exec.spark.SparkReduceRecordHandler.processVectorRow(SparkReduceRecordHandler.java:453)
... 19 more
Driver stacktrace:
失敗階段詳情:

image.png

image.png

image.png

image.png
- 如果使用分組函數(shù)就不會查詢失敗
-- Same streak detection rewritten with GROUP BY ... HAVING instead of a
-- windowed COUNT.  This variant runs fine on the Spark engine, which
-- narrows the failure to the vectorized window-function (PTF) path.
-- Kept byte-for-byte — it is the working half of the comparison.
select
t2.user_id
,t2.age
from
( SELECT
t1.user_id ,
t1.age ,
-- "date minus row_number" trick: constant within a consecutive-day run
date_sub(t1.dt , t1.rn) as diff
from
(select
t.user_id ,
t.age ,
t.dt ,
ROW_NUMBER () over(PARTITION by t.user_id order by t.dt ) rn
from
-- dedupe same-day duplicate visits before numbering
(select
user_id ,
age ,
dt
from user_age group by user_id , age ,dt ) t) t1 ) t2
group by t2.user_id ,
t2.age ,
-- users with >= 2 rows in one (user_id, age, diff) group have a streak
t2.diff HAVING count(t2.user_id) >= 2
- 設置執(zhí)行引擎為 MR 也可以運行
-- Workaround: switch the session to the MapReduce engine; the identical
-- windowed query then completes (all MR jobs succeed, see below).
-- FIX: the original was missing the ';' after SET — the SET and the SELECT
-- are two separate statements and will not parse as one.
set hive.execution.engine=mr;

select
    t2.user_id,
    t2.age,
    -- streak length: rows with the same (user_id, diff) are consecutive days
    COUNT(t2.user_id) over (PARTITION BY t2.user_id, t2.diff) as days
from
(
    SELECT
        t1.user_id,
        t1.age,
        -- "date minus row_number" trick: constant within a consecutive-day run
        date_sub(t1.dt, t1.rn) as diff
    from
    (
        select
            t.user_id,
            t.age,
            t.dt,
            ROW_NUMBER() over (PARTITION by t.user_id order by t.dt) rn
        from
            -- dedupe same-day duplicate visits before numbering
            (select
                 user_id,
                 age,
                 dt
             from user_age
             group by user_id, age, dt) t
    ) t1
) t2
MR 提交的JOB全部執(zhí)行成功

image.png