learning_notes

学习笔记

View project on GitHub

全文检索

中文全文检索

docker_pg_jieba

  1. 安装中文分词插件(选择一个)

依赖

docker >=1.13

准备

启动镜像

sudo docker run -d --name pg_jieba --rm -p 5432:5432 chenxinaz/pg_jieba:10.2-alpine

进入容器

sudo docker exec -ti 容器id /bin/sh
psql -U postgres

创建支持中文字符集的数据库

-- 创建数据库,支持中文字符
create database test with template template0 lc_collate "zh_CN.UTF8" lc_ctype "zh_CN.UTF8" encoding 'UTF8';

开启拓展

-- 进入数据库
\c test
-- 开启拓展
create extension pg_trgm;
create extension pg_jieba;

创建测试数据库

-- 创建表
create table jb_test(                                                                         
name varchar(20) not null default '', --默认文本
name_tsv tsvector); --分词文本

测试

创建生成汉字方法

生成汉字

create or replace function gen_hanzi(int) returns text as 
$$
                 
declare        
 res text;        
begin        
 if $1 >=1 then        
   select string_agg(chr(19968+(random()*20901)::int), '') into res from generate_series(1,$1);        
   return res;        
 end if;        
 return null;        
end;        

$$
language plpgsql strict;  

生成测试数据

-- 生成100万5个汉字
insert into jb_test select gen_hanzi(5) from generate_series(1,1000000);

使用like查询

-- 右模糊
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '国%';
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Gather  (cost=1000.00..11624.33 rows=100 width=16) (actual time=0.582..28.944 rows=40 loops=1)
   Workers Planned: 2
   Workers Launched: 2
   Buffers: shared hit=2124
   ->  Parallel Seq Scan on jb_test  (cost=0.00..10614.33 rows=42 width=16) (actual time=2.256..25.636 rows=13 loops=3)
         Filter: ((name)::text ~~ '国%'::text)
         Rows Removed by Filter: 333320
         Buffers: shared hit=5406
 Planning time: 0.033 ms
 Execution time: 32.746 ms
(10 rows)

-- 全模糊
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%国%';
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Gather  (cost=1000.00..11624.33 rows=100 width=16) (actual time=0.588..37.418 rows=232 loops=1)
   Workers Planned: 2
   Workers Launched: 2
   Buffers: shared hit=2079
   ->  Parallel Seq Scan on jb_test  (cost=0.00..10614.33 rows=42 width=16) (actual time=0.510..34.194 rows=77 loops=3)
         Filter: ((name)::text ~~ '%国%'::text)
         Rows Removed by Filter: 333256
         Buffers: shared hit=5406
 Planning time: 0.037 ms
 Execution time: 41.077 ms
(10 rows)

创建模糊搜索索引

CREATE INDEX idx_jb_test ON jb_test USING GIST (name gist_trgm_ops);
--或者
CREATE INDEX idx_jb_test ON jb_test USING GIN (name gin_trgm_ops);

使用索引(pg_trgm)查询

test=# CREATE INDEX idx_jb_test ON jb_test USING GIN (name gin_trgm_ops);
CREATE INDEX
-- 使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '国%';
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on jb_test  (cost=20.77..381.22 rows=100 width=16) (actual time=0.163..2.431 rows=40 loops=1)
   Recheck Cond: ((name)::text ~~ '国%'::text)
   Heap Blocks: exact=40
   Buffers: shared hit=4 read=40
   ->  Bitmap Index Scan on idx_jb_test  (cost=0.00..20.75 rows=100 width=0) (actual time=0.025..0.025 rows=40 loops=1)
         Index Cond: ((name)::text ~~ '国%'::text)
         Buffers: shared hit=4
 Planning time: 0.508 ms
 Execution time: 2.495 ms
(9 rows)

-- 未使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%国%';
                                                       QUERY PLAN                                                       
------------------------------------------------------------------------------------------------------------------------
 Gather  (cost=1000.00..11624.33 rows=100 width=16) (actual time=3.167..51.069 rows=232 loops=1)
   Workers Planned: 2
   Workers Launched: 2
   Buffers: shared hit=19 read=1978
   ->  Parallel Seq Scan on jb_test  (cost=0.00..10614.33 rows=42 width=16) (actual time=1.609..44.620 rows=77 loops=3)
         Filter: ((name)::text ~~ '%国%'::text)
         Rows Removed by Filter: 333256
         Buffers: shared hit=40 read=5366
 Planning time: 0.073 ms
 Execution time: 53.692 ms
(10 rows)

模糊查询使用索引(pg_trgm)的要求

pg_trgm前后模糊字符个数要求:

  • 有前缀的模糊查询,例如a%,至少需要提供1个字符。( 搜索的是token=’ a’ )
  • 有后缀的模糊查询,例如%ab,至少需要提供2个字符。( 搜索的是token=’ab ‘ )
  • 前后模糊查询,例如%abcd%,至少需要提供3个字符。( 这个使用数组搜索,搜索的是token(s) 包含 {“ a”,” ab”,abc,bcd,”cd “} )
  • 因为pg_trgm生成的TOKEN是三个字符,只有在以上三个条件下,才能匹配到对应的TOKEN
test=# select show_trgm('abc');
        show_trgm        
-------------------------
 {"  a"," ab",abc,"bc "}
(1 row)

再次使用pg_trgm索引

-- 使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%毗娖蝯%';
                                                      QUERY PLAN                                                       
-----------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on jb_test  (cost=20.77..381.22 rows=100 width=16) (actual time=0.036..0.036 rows=1 loops=1)
   Recheck Cond: ((name)::text ~~ '%毗娖蝯%'::text)
   Heap Blocks: exact=1
   Buffers: shared hit=3 read=2
   ->  Bitmap Index Scan on idx_jb_test  (cost=0.00..20.75 rows=100 width=0) (actual time=0.031..0.031 rows=1 loops=1)
         Index Cond: ((name)::text ~~ '%毗娖蝯%'::text)
         Buffers: shared hit=2 read=2
 Planning time: 0.053 ms
 Execution time: 0.050 ms
(9 rows)

-- 使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%毗娖';
                                                      QUERY PLAN                                                       
-----------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on jb_test  (cost=20.77..381.22 rows=100 width=16) (actual time=0.038..0.038 rows=0 loops=1)
   Recheck Cond: ((name)::text ~~ '%毗娖'::text)
   Buffers: shared hit=2 read=2
   ->  Bitmap Index Scan on idx_jb_test  (cost=0.00..20.75 rows=100 width=0) (actual time=0.036..0.036 rows=0 loops=1)
         Index Cond: ((name)::text ~~ '%毗娖'::text)
         Buffers: shared hit=2 read=2
 Planning time: 0.057 ms
 Execution time: 0.055 ms
(8 rows)


解决汉字小于3个字符的情况

  1. 创建表达式
-- 分割字符函数
create or replace function split001(text) returns text[] as 
$$
     
declare      
 res text[];      
begin      
 select regexp_split_to_array($1,'') into res;      
 for i in 1..length($1)-1 loop      
   res := array_append(res, substring($1,i,2));      
 end loop;      
 return res;      
end;      

$$
language plpgsql strict immutable;

  1. 生成表达式索引
    create index idx_jieba_name on jb_test using gin (split001(name));
    
  2. 使用索引 ```sql test=# explain (analyze,buffers,costs,timing) select * from jb_test where split001(name) @>split001(‘孟’); QUERY PLAN
    ———————————————————————————————————————- Bitmap Heap Scan on t_trgm (cost=74.63..7588.84 rows=5500 width=15) (actual time=0.060..0.264 rows=276 loops=1) Recheck Cond: (split001(name) @> ‘{孟}’::text[]) Heap Blocks: exact=267 Buffers: shared hit=271 -> Bitmap Index Scan on idx_jieba_name (cost=0.00..73.25 rows=5500 width=0) (actual time=0.035..0.035 rows=276 loops=1) Index Cond: (split001(name) @> ‘{孟}’::text[]) Buffers: shared hit=4 Planning Time: 0.073 ms Execution Time: 0.284 ms (9 rows)

### 使用pg_jieba分词

#### 清空以上数据

```sql
-- 删除表
drop table jb_test ;
-- 创建表
create table jb_test(                                                                         
name varchar(20) not null default '', --默认文本
name_tsv tsvector); --分词文本

创建分词触发插件

-- 创建触发器
create trigger nametsupdate before insert or update on jb_test for each row execute procedure 
tsvector_update_trigger(name_tsv,'public.jiebacfg',name);

-- 如果有旧数据,更新
update jb_test set name_tsv = to_tsvector('jiebacfg',coalesce(name,''));

生成测试数据

-- 生成100万10个汉字
insert into jb_test select gen_hanzi(10) from generate_series(1,1000000);

使用分词查询数据

-- 搜索包含财或者务的文本
test=# explain (analyze,buffers,costs,timing) select name from jb_test where name_tsv @@ '财|务' limit 10;
                                                  QUERY PLAN                                                   
---------------------------------------------------------------------------------------------------------------
 Limit  (cost=0.00..527.96 rows=10 width=31) (actual time=0.015..2.003 rows=10 loops=1)
   Buffers: shared read=199
   ->  Seq Scan on jb_test  (cost=0.00..35215.00 rows=667 width=31) (actual time=0.015..2.001 rows=10 loops=1)
         Filter: (name_tsv @@ ''''' | '''''::tsquery)
         Rows Removed by Filter: 8714
         Buffers: shared read=199
 Planning time: 0.058 ms
 Execution time: 2.014 ms
(8 rows)

创建索引

CREATE INDEX idx_jb_test ON jb_test USING GIN (name_tsv tsvector_ops);

使用索引查询

test=# explain (analyze,buffers,costs,timing) select name from jb_test where name_tsv @@ '财|务' limit 10;
                                                          QUERY PLAN                                                           
-------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=41.17..75.71 rows=10 width=31) (actual time=0.247..0.255 rows=10 loops=1)
   Buffers: shared hit=16
   ->  Bitmap Heap Scan on jb_test  (cost=41.17..2345.53 rows=667 width=31) (actual time=0.246..0.254 rows=10 loops=1)
         Recheck Cond: (name_tsv @@ ''''' | '''''::tsquery)
         Heap Blocks: exact=9
         Buffers: shared hit=16
         ->  Bitmap Index Scan on idx_jb_test  (cost=0.00..41.00 rows=667 width=0) (actual time=0.163..0.163 rows=764 loops=1)
               Index Cond: (name_tsv @@ ''''' | '''''::tsquery)
               Buffers: shared hit=7
 Planning time: 0.080 ms
 Execution time: 0.280 ms
(11 rows)


注意问题

  1. pg_jieba每次首次加载数据慢
    pg_jieba.so没有放在shared_preload_libraries或session_preload_libraries中 那么每个会话启动时,都需要load pg_jieba.so,从而导致了第一次查询速度非常慢。
    

    解决: ```bash vi postgresql.conf

shared_preload_libraries = ‘pg_jieba.so’
or
session_preload_libraries = ‘pg_jieba.so’ #重启数据库 pg_ctl restart -m fast

```