了解那些“奇葩”SQL写法,快速写出高效率SQL
阿里妹导读
本文主要讲解常见的SQL开发场景、‘奇葩’SQL写法并深入执行计划,带你了解如何快速写出高效率SQL。
背景
高效写法
union直接使用效率低吗?
场景介绍
写法&执行计划探查
-- Baseline: combine both sources with a plain UNION, which also
-- de-duplicates the combined rows.
-- Both branches read script-level table variables (@...); the second branch
-- previously referenced `cst_info_b` without the `@` prefix, unlike every
-- other query on the same data.
-- The subquery alias is `merged` so it no longer shadows the cst_info column.
SELECT
    cst_id
    ,cst_info
FROM (
    SELECT cst_id, cst_info
    FROM @cst_info_a
    WHERE dt = '${bizdate}'
    UNION
    SELECT cst_id, cst_info
    FROM @cst_info_b
    WHERE dt = '${bizdate}'
) merged
;
这种情况下,会理解为先将两份数据不做任何处理就合并在一起,导致shuffle、中间临时写入的数据量和读取的数据量都与数据源一致,然后再去做去重。因为数据量在中间过程没有减少,所以效率相对来说会低一些。现在来看一下执行计划:
-- Approach 1: de-duplicate each source with GROUP BY first,
-- then combine the two reduced sets with UNION.
SELECT
    merged.cst_id
    ,merged.cst_info
FROM (
    SELECT
        cst_id
        ,cst_info
    FROM @cst_info_a
    WHERE dt = '${bizdate}'
    GROUP BY
        cst_id
        ,cst_info
    UNION
    SELECT
        cst_id
        ,cst_info
    FROM @cst_info_b
    WHERE dt = '${bizdate}'
    GROUP BY
        cst_id
        ,cst_info
) merged
;
-- Approach 2: de-duplicate each source with GROUP BY, stack the results with
-- UNION ALL, then de-duplicate the stacked rows with an outer GROUP BY.
SELECT
    stacked.cst_id
    ,stacked.cst_info
FROM (
    SELECT
        cst_id
        ,cst_info
    FROM @cst_info_a
    WHERE dt = '${bizdate}'
    GROUP BY
        cst_id
        ,cst_info
    UNION ALL
    SELECT
        cst_id
        ,cst_info
    FROM @cst_info_b
    WHERE dt = '${bizdate}'
    GROUP BY
        cst_id
        ,cst_info
) stacked
GROUP BY
    stacked.cst_id
    ,stacked.cst_info
;
两种写法的执行计划一致,如下:
总结
count distinct真的慢吗?
场景介绍
写法&执行计划探查
-- Scope: partitions from ${bizdate-5} through ${bizdate} (inclusive on both ends).
-- Common form: count distinct customers directly with COUNT(DISTINCT ...).
SELECT COUNT(DISTINCT cst_id) AS cst_cnt
FROM @pc_bill_bal
WHERE dt BETWEEN '${bizdate-5}' AND '${bizdate}'
;
-- "Optimized" form: de-duplicate customers with GROUP BY, then count the groups.
-- The outer aggregate is COUNT(cst_id), not COUNT(1): GROUP BY emits one group
-- for NULL cst_id, which COUNT(1) would count but COUNT(DISTINCT cst_id)
-- ignores. COUNT(cst_id) skips that NULL group, so both forms return the same
-- number.
SELECT COUNT(cst_id) AS cst_cnt
FROM (
    SELECT cst_id
    FROM @pc_bill_bal
    WHERE dt BETWEEN '${bizdate-5}' AND '${bizdate}'
    GROUP BY cst_id
) base
;
一般都会认为直接count distinct效率很低,是这样吗?接下来看一下两个执行计划对比
-- Scope: partitions from ${bizdate-5} through ${bizdate} (inclusive on both ends).
-- Common form: distinct customer count per partition date via COUNT(DISTINCT ...).
SELECT
    src.dt
    ,COUNT(DISTINCT src.cst_id) AS cst_cnt
FROM @pc_bill_bal src
WHERE src.dt BETWEEN '${bizdate-5}' AND '${bizdate}'
GROUP BY src.dt
;
-- "Optimized" form: first reduce to one row per (dt, cst_id) with GROUP BY,
-- then count the non-NULL customers for each dt.
SELECT
    dedup.dt
    ,COUNT(dedup.cst_id) AS cst_cnt
FROM (
    SELECT
        dt
        ,cst_id
    FROM @pc_bill_bal
    WHERE dt BETWEEN '${bizdate-5}' AND '${bizdate}'
    GROUP BY
        dt
        ,cst_id
) dedup
GROUP BY dedup.dt
;
看一下这种场景下两种执行计划对比
总结
多张大表join提速(聚合类型)
场景介绍
写法&执行计划探查
-- Example: collect every asset measure per customer from the asset pool.
-- Baseline form: chained FULL OUTER JOINs, with COALESCE used to pick the
-- non-NULL customer id and to turn missing measures into 0.
SELECT
    COALESCE(init_retail.cst_id, buy.cst_id)   AS cst_id
    ,COALESCE(init_retail.bal_init_prin, 0)    AS bal_init_prin
    ,COALESCE(init_retail.amt_retail_prin, 0)  AS amt_retail_prin
    ,COALESCE(buy.amt_buy_prin, 0)             AS amt_buy_prin
FROM (
    -- Step 1: start-of-day assets joined with assets disbursed today.
    SELECT
        COALESCE(init.cst_id, retail.cst_id)   AS cst_id
        ,COALESCE(init.bal_init_prin, 0)       AS bal_init_prin
        ,COALESCE(retail.amt_retail_prin, 0)   AS amt_retail_prin
    FROM @bal_init init              -- start-of-day assets
    FULL OUTER JOIN @amt_retail retail   -- assets disbursed today
        ON init.cst_id = retail.cst_id
) init_retail
-- Step 2: add assets purchased today.
FULL OUTER JOIN @amt_buy buy         -- assets purchased today
    ON init_retail.cst_id = buy.cst_id
;
接下来看优化写法:
-- Optimized form 1: stack the three sources with UNION ALL, padding the
-- measures a source does not carry with 0, then aggregate once per customer.
-- Each source measure is wrapped in COALESCE(..., 0): SUM over only-NULL
-- inputs returns NULL, whereas the FULL OUTER JOIN + COALESCE form returns 0
-- for the same customer. Normalizing at the source keeps both forms aligned.
SELECT
    cst_id
    ,SUM(bal_init_prin)   AS bal_init_prin
    ,SUM(amt_retail_prin) AS amt_retail_prin
    ,SUM(amt_buy_prin)    AS amt_buy_prin
FROM (
    SELECT cst_id, COALESCE(bal_init_prin, 0) AS bal_init_prin, 0 AS amt_retail_prin, 0 AS amt_buy_prin
    FROM @bal_init      -- start-of-day assets
    UNION ALL
    SELECT cst_id, 0 AS bal_init_prin, COALESCE(amt_retail_prin, 0) AS amt_retail_prin, 0 AS amt_buy_prin
    FROM @amt_retail    -- assets disbursed today
    UNION ALL
    SELECT cst_id, 0 AS bal_init_prin, 0 AS amt_retail_prin, COALESCE(amt_buy_prin, 0) AS amt_buy_prin
    FROM @amt_buy       -- assets purchased today
) t1
GROUP BY
    cst_id
;
-- Optimized form 2: stack the three sources into a narrow (cst_id, prin, flag)
-- shape, where flag tags the source (1 = start-of-day, 2 = disbursed today,
-- 3 = purchased today), then pivot back to columns with conditional SUMs.
-- Each source measure is wrapped in COALESCE(..., 0): SUM over only-NULL
-- inputs returns NULL, whereas the FULL OUTER JOIN + COALESCE form returns 0
-- for the same customer. Normalizing at the source keeps both forms aligned.
SELECT
    cst_id
    ,SUM(IF(flag = 1, prin, 0)) AS bal_init_prin
    ,SUM(IF(flag = 2, prin, 0)) AS amt_retail_prin
    ,SUM(IF(flag = 3, prin, 0)) AS amt_buy_prin
FROM (
    SELECT cst_id, COALESCE(bal_init_prin, 0) AS prin, 1 AS flag
    FROM @bal_init      -- start-of-day assets
    UNION ALL
    SELECT cst_id, COALESCE(amt_retail_prin, 0) AS prin, 2 AS flag
    FROM @amt_retail    -- assets disbursed today
    UNION ALL
    SELECT cst_id, COALESCE(amt_buy_prin, 0) AS prin, 3 AS flag
    FROM @amt_buy       -- assets purchased today
) t1
GROUP BY
    cst_id
;
对比join写法和优化写法的执行计划(这两个执行计划内部做的事情和任务名称理解一致,就不展开看了)
总结
多张大表join提速(字符串类型)
场景介绍
写法&执行计划探查
-- Similar to the previous case: first build the set of customer ids from all
-- three sources (UNION de-duplicates them), then LEFT JOIN each source back
-- onto that id set -- three LEFT JOINs in total.
SELECT
    ids.cst_id               AS cst_id
    ,init.bal_init_prin      AS bal_init_prin
    ,retail.amt_retail_prin  AS amt_retail_prin
    ,buy.amt_buy_prin        AS amt_buy_prin
FROM (
    SELECT cst_id
    FROM @bal_init       -- start-of-day assets
    UNION
    SELECT cst_id
    FROM @amt_retail     -- assets disbursed today
    UNION
    SELECT cst_id
    FROM @amt_buy        -- assets purchased today
) ids
LEFT JOIN @bal_init init         -- start-of-day assets
    ON ids.cst_id = init.cst_id
LEFT JOIN @amt_retail retail     -- assets disbursed today
    ON ids.cst_id = retail.cst_id
LEFT JOIN @amt_buy buy           -- assets purchased today
    ON ids.cst_id = buy.cst_id
;
接下来看优化写法:
-- For STRING-typed values: merge the sources through a JSON string instead of joins.
--   1. Each source row becomes a `"key":"value"` fragment.
--   2. Fragments are collected per customer and wrapped in braces.
--   3. Each output column is read back out of that string by key.
-- NOTE(review): values are concatenated without escaping; a value containing a
-- double quote or backslash would presumably break the JSON -- confirm inputs.
SELECT
    packed.cst_id
    ,GET_JSON_OBJECT(packed.all_val, '$.bal_init_prin')   AS bal_init_prin
    ,GET_JSON_OBJECT(packed.all_val, '$.amt_retail_prin') AS amt_retail_prin
    ,GET_JSON_OBJECT(packed.all_val, '$.amt_buy_prin')    AS amt_buy_prin
FROM (
    SELECT
        fragments.cst_id
        ,CONCAT('{', CONCAT_WS(',', COLLECT_SET(fragments.all_val)), '}') AS all_val
    FROM (
        SELECT
            cst_id
            ,CONCAT('\"bal_init_prin\":\"', bal_init_prin, '\"') AS all_val
        FROM @bal_init      -- start-of-day assets
        UNION ALL
        SELECT
            cst_id
            ,CONCAT('\"amt_retail_prin\":\"', amt_retail_prin, '\"') AS all_val
        FROM @amt_retail    -- assets disbursed today
        UNION ALL
        SELECT
            cst_id
            ,CONCAT('\"amt_buy_prin\":\"', amt_buy_prin, '\"') AS all_val
        FROM @amt_buy       -- assets purchased today
    ) fragments
    GROUP BY fragments.cst_id
) packed
;
对比join写法和优化写法的执行计划
总结
mapjoin为什么快?是否生效了?
场景介绍
写法&执行计划探查
-- base is the large table; fee_year_rate is the small table.
-- Approach 1: INNER JOIN.
SELECT
    base.*
    ,fee_year_rate.*
FROM @base AS base
INNER JOIN @fee_year_rate AS fee_year_rate
    ON base.terms = fee_year_rate.terms
;
-- Approach 2: LEFT JOIN (large table base on the left, small table on the right).
SELECT
    base.*
    ,fee_year_rate.*
FROM @base AS base
LEFT JOIN @fee_year_rate AS fee_year_rate
    ON base.terms = fee_year_rate.terms
;
-- Approach 3: RIGHT JOIN (large table base on the left, small table on the right).
SELECT
    base.*
    ,fee_year_rate.*
FROM @base AS base
RIGHT JOIN @fee_year_rate AS fee_year_rate
    ON base.terms = fee_year_rate.terms
;
-- Approach 4: FULL OUTER JOIN between base and fee_year_rate.
SELECT
    base.*
    ,fee_year_rate.*
FROM @base AS base
FULL OUTER JOIN @fee_year_rate AS fee_year_rate
    ON base.terms = fee_year_rate.terms
;
对比一下执行计划
总结
distmapjoin:加强版mapjoin
场景介绍
写法&执行计划探查
-- Baseline: plain LEFT JOIN of base to cst_info on customer id and
-- institution code, with no join hint.
SELECT
    base.*
    ,cst_info.*
FROM @base AS base
LEFT JOIN @cst_info AS cst_info
    ON  base.cst_id = cst_info.cst_id
    AND base.origin_inst_code = cst_info.inst_id
;
优化写法:
-- Same join as the unhinted form, with a distmapjoin hint on the cst_info
-- alias using shard_count=20. The hint names the alias, so the alias must
-- stay `cst_info`.
SELECT /*+distmapjoin(cst_info(shard_count=20))*/
    base.*
    ,cst_info.*
FROM @base AS base
LEFT JOIN @cst_info AS cst_info
    ON  base.cst_id = cst_info.cst_id
    AND base.origin_inst_code = cst_info.inst_id
;
对比执行计划
总结
where限制条件写在外层会很慢吗?
场景介绍
写法&执行计划探查
-- "Canonical" form: filter base down to terms = '12' inside a subquery,
-- then join the already-filtered rows.
SELECT
    base.*
    ,fee_year_rate.*
FROM (
    SELECT *
    FROM @base
    WHERE terms = '12'
) AS base
INNER JOIN @fee_year_rate AS fee_year_rate
    ON base.terms = fee_year_rate.terms
;
-- "Non-canonical" form: join first in the query text, and apply the
-- terms = '12' filter in the outer WHERE clause.
SELECT
    base.*
    ,fee_year_rate.*
FROM @base AS base
INNER JOIN @fee_year_rate AS fee_year_rate
    ON base.terms = fee_year_rate.terms
WHERE base.terms = '12'
;
印象中,规范写法的运行效率肯定会高一些;但看一下执行计划会发现,两种写法的执行计划是一样的,都在join之前做了过滤。
总结
总结
微信扫码关注该文公众号作者
戳这里提交新闻线索和高质量文章给我们。
来源: qq
点击查看作者最近其他文章