项目讲解4

流过昼夜 提交于 2020-04-05 19:57:34

分组取topN的方法:

数据的预处理为使用mapreduce

每日新访客:

回头/单次访客统计:

漏斗模型:

使用python产生数据

1.统计每个步骤的总访问人数

create table tmp_page_views like ods_weblog_origin;创建一个像 ods_weblog_origin这个表格式的表

desc  tmp_page_views;

把python生成的mylog.log数据导入

统计每个step的数量:(为什么要加这个?step1 as step)

select‘step1’ as step ,count(distinct remote_order) as number from tmp_page_views where request like '%/iterm%';

union就是把行并在一起,把表纵向加起来

把拼接的表放到route_number中

create table dw_oute_numbs as 
select 'step1' as step,count(distinct remote_addr)  as numbs from ods_click_pageviews where request like '/item%'
union
select 'step2' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where request like '/category%'
union
select 'step3' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where request like '/order%'
union
select 'step4' as step,count(distinct remote_addr)  as numbs from ods_click_pageviews where request like '/index%';

2.查询每一步骤相对于路径起点人数的比例

select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs  from route_num rn

inner join

route_num rr

select tmp.rnstep,tmp.rnnumbs/tmp.rrnumbs as ratio

from

(

select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs  from dw_oute_numbs rn

inner join

dw_oute_numbs rr) tmp

where tmp.rrstep='step1';

3.查询每一步骤相对于上一步骤的漏出率

select tmp.rrstep as rrstep,tmp.rrnumbs/tmp.rnnumbs as ration

from

(

select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs  from route_num rn

inner join

route_num rr) tmp

where cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1

4.把查询的两个结果汇总到一个表中

select abs.step,abs.numbs,abs.ratio as abs_ratio,rel.ratio as rel_ratio
from 
(
select tmp.rnstep as step,tmp.rnnumbs as numbs,tmp.rnnumbs/tmp.rrnumbs as ratio
from
(
select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs  from route_num rn
inner join 
route_num rr) tmp
where tmp.rrstep='step1'
) abs
left outer join
(
select tmp.rrstep as step,tmp.rrnumbs/tmp.rnnumbs as ratio
from
(
select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs  from route_num rn
inner join 
route_num rr) tmp
where cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1
) rel
on abs.step=rel.step

 

 

 

 

 

 

 

 

 

 

 

 

 

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!