citus系列7-topn加速(count()groupbyorderbycount()de

背景

count(*) group by order by count(*) desc limit x 用來統(tǒng)計 topn。

成都創(chuàng)新互聯(lián)公司服務(wù)項目包括青白江網(wǎng)站建設(shè)、青白江網(wǎng)站制作、青白江網(wǎng)頁制作以及青白江網(wǎng)絡(luò)營銷策劃等。多年來，我們專注于互聯(lián)網(wǎng)行業(yè)，利用自身積累的技術(shù)優(yōu)勢、行業(yè)經(jīng)驗、深度合作伙伴關(guān)系等，向廣大中小型企業(yè)、政府機構(gòu)等提供互聯(lián)網(wǎng)行業(yè)的解決方案，青白江網(wǎng)站推廣取得了明顯的社會效益與經(jīng)濟效益。目前，我們服務(wù)的客戶以成都為中心已經(jīng)輻射到青白江省份的部分城市，未來相信會繼續(xù)擴大服務(wù)區(qū)域并繼續(xù)獲得客戶的支持與信任！

topn是運營的重要指標，比如排行前10的活躍用戶。

如果數(shù)據(jù)量非常龐大，統(tǒng)計會比較耗時，citus提供了一個topn插件，與HLL類似，核心是使用少量空間存儲聚合過程中的數(shù)據(jù)，同時返回一個固定大?。▍?shù)設(shè)置topn.number_of_counters）的JSONB，可用于下次聚合。(注意，PostgreSQL 11支持更強大的hashagg parallel后，聚合大數(shù)據(jù)量已不是問題)

topn插件聚合過程如圖。

citus 系列7 - topn 加速(count(*) group by order by count(*) de

對topn的結(jié)果使用topn_union_agg可以再次聚合。

postgres=# \df topn*  
                              List of functions  
 Schema |       Name       | Result data type  | Argument data types |  Type    
--------+------------------+-------------------+---------------------+--------  
 public | topn             | SETOF topn_record | jsonb, integer      | normal  
 public | topn_add         | jsonb             | jsonb, text         | normal  
 public | topn_add_agg     | jsonb             | text                | agg  
 public | topn_add_trans   | internal          | internal, text      | normal  
 public | topn_pack        | jsonb             | internal            | normal  
 public | topn_union       | jsonb             | jsonb, jsonb        | normal  
 public | topn_union_agg   | jsonb             | jsonb               | agg  
 public | topn_union_trans | internal          | internal, jsonb     | normal  
(8 rows)

-- starting from nothing, record that we saw an "a"  
select topn_add('{}', 'a');  
-- => {"a": 1}  
  
-- record the sighting of another "a"  
select topn_add(topn_add('{}', 'a'), 'a');  
-- => {"a": 2}  
  
-- for normal_rand  
create extension tablefunc;  
  
-- count values from a normal distribution  
SELECT topn_add_agg(floor(abs(i))::text)  
  FROM normal_rand(1000, 5, 0.7) i;  
-- => {"2": 1, "3": 74, "4": 420, "5": 425, "6": 77, "7": 3}

從topn jsonb中直接獲取topn的值

postgres=# select  (topn(topn_union_agg(agg_prodid),5)).* from reviews_by_prodid;  
  item  | frequency   
--------+-----------  
 509594 |        66  
 497599 |        59  
 505217 |        58  
 461257 |        58  
 403111 |        57  
(5 rows)

使用topn

1、所有節(jié)點（包括coordinator, worker）安裝topn軟件

cd ~   
. /var/lib/pgsql/.bash_profile  
git clone https://github.com/citusdata/postgresql-topn  
cd postgresql-topn  
USE_PGXS=1 make  
USE_PGXS=1 make install

2、安裝插件(coordinator)

postgres=# create extension topn;  
CREATE EXTENSION

3、安裝插件(worker)，在coordinator中調(diào)用run_command_on_workers，在所有worker中執(zhí)行。

postgres=# select run_command_on_workers('create extension topn;');  
           run_command_on_workers             
--------------------------------------------  
 (xxx.xxx.xxx.224,1921,t,"CREATE EXTENSION")  
 (xxx.xxx.xxx.225,1921,t,"CREATE EXTENSION")  
 (xxx.xxx.xxx.226,1921,t,"CREATE EXTENSION")  
 (xxx.xxx.xxx.227,1921,t,"CREATE EXTENSION")  
 (xxx.xxx.xxx.229,1921,t,"CREATE EXTENSION")  
 (xxx.xxx.xxx.230,1921,t,"CREATE EXTENSION")  
 (xxx.xxx.xxx.231,1921,t,"CREATE EXTENSION")  
 (xxx.xxx.xxx.232,1921,t,"CREATE EXTENSION")  
(8 rows)

測試

1、測試表

create table tbl(id serial8,gid int, prodid int, c1 int, c2 int);  
  
postgres=# \d tbl  
                            Table "public.tbl"  
 Column |  Type   | Collation | Nullable |             Default               
--------+---------+-----------+----------+---------------------------------  
 id     | bigint  |           | not null | nextval('tbl_id_seq'::regclass)  
 gid    | integer |           |          |   
 prodid | integer |           |          |   
 c1     | integer |           |          |   
 c2     | integer |           |          |   
  
postgres=# alter sequence tbl_id_seq cache 10000;  
ALTER SEQUENCE

2、寫入2億測試數(shù)據(jù)

vi test.sql  
\set gid random_gaussian(1,1000,2.5)  
\set prodid random_gaussian(1,1000000,2.5)  
\set c1 random(1,3000)  
\set c2 random(1,100000000)  
insert into tbl(gid,prodid,c1,c2) values (:gid,:prodid,:c1,:c2);  
  
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 1200

postgres=# select count(*) from tbl;  
   count     
-----------  
 216524755  
(1 row)  
  
Time: 421.860 ms

3、幾組真實的TOPN數(shù)據(jù)

postgres=# select gid,count(*) from tbl group by gid order by count(*) desc limit 10;  
 gid | count    
-----+--------  
 494 | 438102  
 499 | 438017  
 514 | 437929  
 506 | 437852  
 511 | 437546  
 509 | 437469  
 495 | 437458  
 490 | 437320  
 496 | 437257  
 500 | 437239  
(10 rows)  
  
postgres=# select c1,count(*) from tbl group by c1 order by count(*) desc limit 10;  
  c1  | count   
------+-------  
 1370 | 73175  
  168 | 73121  
 1016 | 73114  
 1816 | 73045  
 1463 | 73020  
  585 | 72986  
 1529 | 72974  
 1857 | 72944  
 2580 | 72930  
  298 | 72917  
(10 rows)  
  
postgres=# select prodid,count(*) from tbl group by prodid order by count(*) desc limit 10;  
 prodid | count   
--------+-------  
 516916 |   534  
 481914 |   534  
 520680 |   527  
 530544 |   526  
 449685 |   523  
 493560 |   523  
 520464 |   523  
 502098 |   522  
 495170 |   522  
 501695 |   522  
(10 rows)

4、gid維度估值topn (gid唯一值個數(shù)小于等于參數(shù) topn.number_of_counters )

結(jié)果精準

CREATE TABLE reviews_by_gid  
(    
  agg jsonb  
);  
  
SELECT create_reference_table('reviews_by_gid');  
  
INSERT INTO reviews_by_gid  
  SELECT topn_add_agg(gid::text)  
  FROM tbl;  
  
postgres=# select  (topn(agg,5)).* from reviews_by_gid;  
 item | frequency   
------+-----------  
 494  |    438102  
 499  |    438017  
 514  |    437929  
 506  |    437852  
 511  |    437546  
(5 rows)

5、prodid維度估值topn (prodid唯一值個數(shù)遠遠大于等于參數(shù) topn.number_of_counters )

結(jié)果偏差非常大。

CREATE TABLE reviews_by_prodid  
(  
  agg_prodid jsonb  
);  
  
SELECT create_reference_table('reviews_by_prodid');  
  
INSERT INTO reviews_by_prodid  
  SELECT topn_add_agg(prodid::text)  
  FROM tbl;  
  
  
postgres=# select  (topn(agg_prodid,5)).* from reviews_by_prodid;  
  item  | frequency   
--------+-----------  
 470098 |        36  
 531880 |        35  
 451724 |        34  
 420093 |        34  
 522676 |        33  
(5 rows)

6、c1維度估值topn (c1唯一值個數(shù)略大于等于參數(shù) topn.number_of_counters )

結(jié)果不精準。

CREATE TABLE reviews_by_c1  
(  
  aggc1 jsonb  
);  
  
SELECT create_reference_table('reviews_by_c1');  
  
INSERT INTO reviews_by_c1  
  SELECT topn_add_agg(c1::text)  
  FROM tbl;  
  
  
postgres=# select  (topn(aggc1,5)).* from reviews_by_c1;  
 item | frequency   
------+-----------  
 2580 |     37073  
 1016 |     36162  
 1983 |     35311  
 1752 |     35285  
 2354 |     34740  
(5 rows)

精度、截斷

造成以上精準度偏差的原因：

當(dāng)topn hashtable已滿，有新值寫入時，會導(dǎo)致清除hashtable中一半的元素（item, count）pairs(指按count排序后，較小的一半)。

The TopN approximation algorithm keeps a predefined number of frequent items and counters. If a new item already exists among these frequent items, the algorithm increases the item's frequency counter. Else, the algorithm inserts the new item into the counter list when there is enough space. If there isn't enough space, the algorithm evicts the bottom half of all counters. Since we typically keep counters for many more items (e.g. 100*N) than we are actually interested in, the actual top N items are unlikely to get evicted and will typically have accurate counts.

You can increase the algoritm's accuracy by increasing the predefined number of frequent items/counters.

對應(yīng)代碼

/*  
 * PruneHashTable removes some items from the HashTable to decrease its size. It finds  
 * minimum and maximum frequencies first and removes the items which have lower frequency  
 * than the average of them.  
 */  
static void  
PruneHashTable(HTAB *hashTable, int itemLimit, int numberOfRemainingElements)  
{  
	Size topnArraySize = 0;  
	int topnIndex = 0;  
	FrequentTopnItem *sortedTopnArray = NULL;  
	bool itemAlreadyHashed = false;  
	HASH_SEQ_STATUS status;  
	FrequentTopnItem *currentTask = NULL;  
	FrequentTopnItem *frequentTopnItem = NULL;  
	int index = 0;  
	int hashTableSize = hash_get_num_entries(hashTable);  
  
	if (hashTableSize <= itemLimit)  
	{  
		return;  
	}  
  
	/* create an array to copy top-n items and sort them later */  
	topnArraySize = sizeof(FrequentTopnItem) * hashTableSize;  
	sortedTopnArray = (FrequentTopnItem *) palloc0(topnArraySize);  
  
	hash_seq_init(&status, hashTable);  
  
	while ((currentTask = (FrequentTopnItem *) hash_seq_search(&status)) != NULL)  
	{  
		frequentTopnItem = palloc0(sizeof(FrequentTopnItem));  
		memcpy(frequentTopnItem->key, currentTask->key,  
			   sizeof(frequentTopnItem->key));  
		frequentTopnItem->frequency = currentTask->frequency;  
		sortedTopnArray[topnIndex] = *frequentTopnItem;  
  
		topnIndex++;  
	}  
  
	qsort(sortedTopnArray, hashTableSize, sizeof(FrequentTopnItem),  
		  compareFrequentTopnItem);  
  
	for (index = numberOfRemainingElements; index < hashTableSize; index++)  
	{  
		FrequentTopnItem *topnItem = &(sortedTopnArray[index]);  
  
		hash_search(hashTable, (void *) topnItem->key, HASH_REMOVE,  
					&itemAlreadyHashed);  
	}  
}

如何修改hash table size

postgres=# load 'topn';  
LOAD  
postgres=# show topn.number_of_counters ;  
 topn.number_of_counters   
-------------------------  
 1000  
(1 row)  
  
set topn.number_of_counters =20000;

需要在所有節(jié)點（coordinator+worker）操作，例如。

postgresql.conf  
  
shared_preload_libraries='citus,topn,pg_stat_statements'  
topn.number_of_counters=10000

小結(jié)

最佳實踐

1、建議階段性聚合，并且保證每個階段被聚合的字段，唯一值個數(shù)小于 topn.number_of_counters ，否則會失真。

例如每小時有1萬個活躍用戶，那么 topn.number_of_counters ，建議設(shè)置為1萬或更大，并且按小時聚合。每個小時存一個聚合后的jsonb結(jié)果。需要統(tǒng)計天的結(jié)果時，再將全天的jsonb進行聚合。

2、元素個數(shù)大于 topn.number_of_counters 時，會導(dǎo)致topn結(jié)果失真。

參考

https://github.com/citusdata/postgresql-topn

https://docs.citusdata.com/en/v7.5/develop/reference_sql.html

《PostgreSQL count-min sketch top-n 概率計算插件 cms_topn (結(jié)合窗口實現(xiàn)同比、環(huán)比、滑窗分析等) - 流計算核心功能之一》

原文地址：https://github.com/digoal/blog/blob/master/201809/20180914_01.md

網(wǎng)站欄目：citus系列7-topn加速(count(*)groupbyorderbycount(*)de
標題URL：http://jinyejixie.com/article38/ijcdpp.html

成都網(wǎng)站建設(shè)公司_創(chuàng)新互聯(lián)，為您提供網(wǎng)站設(shè)計、搜索引擎優(yōu)化、Google、小程序開發(fā)、網(wǎng)站策劃、網(wǎng)頁設(shè)計公司

聲明：本網(wǎng)站發(fā)布的內(nèi)容（圖片、視頻和文字）以用戶投稿、用戶轉(zhuǎn)載內(nèi)容為主，如果涉及侵權(quán)請盡快告知，我們將會在第一時間刪除。文章觀點不代表本網(wǎng)站立場，如需處理請聯(lián)系客服。電話：028-86922220；郵箱：631063699@qq.com。內(nèi)容未經(jīng)允許不得轉(zhuǎn)載，或轉(zhuǎn)載時需注明來源：創(chuàng)新互聯(lián)

猜你還喜歡下面的內(nèi)容

成人午夜视频全免费观看高清-秋霞福利视频一区二区三区-国产精品久久久久电影小说-亚洲不卡区三一区三区一区

citus系列7-topn加速(count(*)groupbyorderbycount(*)de

背景