全文检索
中文全文检索
- 安装中文分词插件(选择一个)
- scws简单中文分词系统 中文分词库
- zhparser
- pg_jieba
依赖
docker >=1.13
准备
启动镜像
sudo docker run -d --name pg_jieba --rm -p 5432:5432 chenxinaz/pg_jieba:10.2-alpine
进入容器
sudo docker exec -ti 容器id /bin/sh
psql -U postgres
创建支持中文字符集的数据库
-- 创建数据库,支持中文字符
create database test with template template0 lc_collate "zh_CN.UTF8" lc_ctype "zh_CN.UTF8" encoding 'UTF8';
开启拓展
-- 进入数据库
\c test
-- 开启拓展
create extension pg_trgm;
create extension pg_jieba;
创建测试数据库
-- 创建表
create table jb_test(
name varchar(20) not null default '', --默认文本
name_tsv tsvector); --分词文本
测试
创建生成汉字方法
生成汉字
create or replace function gen_hanzi(int) returns text as
$$
declare
res text;
begin
if $1 >=1 then
select string_agg(chr(19968+(random()*20901)::int), '') into res from generate_series(1,$1);
return res;
end if;
return null;
end;
$$
language plpgsql strict;
生成测试数据
-- 生成100万5个汉字
insert into jb_test select gen_hanzi(5) from generate_series(1,1000000);
使用like查询
-- 右模糊
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '国%';
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..11624.33 rows=100 width=16) (actual time=0.582..28.944 rows=40 loops=1)
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=2124
-> Parallel Seq Scan on jb_test (cost=0.00..10614.33 rows=42 width=16) (actual time=2.256..25.636 rows=13 loops=3)
Filter: ((name)::text ~~ '国%'::text)
Rows Removed by Filter: 333320
Buffers: shared hit=5406
Planning time: 0.033 ms
Execution time: 32.746 ms
(10 rows)
-- 全模糊
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%国%';
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..11624.33 rows=100 width=16) (actual time=0.588..37.418 rows=232 loops=1)
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=2079
-> Parallel Seq Scan on jb_test (cost=0.00..10614.33 rows=42 width=16) (actual time=0.510..34.194 rows=77 loops=3)
Filter: ((name)::text ~~ '%国%'::text)
Rows Removed by Filter: 333256
Buffers: shared hit=5406
Planning time: 0.037 ms
Execution time: 41.077 ms
(10 rows)
创建模糊搜索索引
CREATE INDEX idx_jb_test ON jb_test USING GIST (name gist_trgm_ops);
--或者
CREATE INDEX idx_jb_test ON jb_test USING GIN (name gin_trgm_ops);
使用索引(pg_trgm)查询
test=# CREATE INDEX idx_jb_test ON jb_test USING GIN (name gin_trgm_ops);
CREATE INDEX
-- 使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '国%';
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on jb_test (cost=20.77..381.22 rows=100 width=16) (actual time=0.163..2.431 rows=40 loops=1)
Recheck Cond: ((name)::text ~~ '国%'::text)
Heap Blocks: exact=40
Buffers: shared hit=4 read=40
-> Bitmap Index Scan on idx_jb_test (cost=0.00..20.75 rows=100 width=0) (actual time=0.025..0.025 rows=40 loops=1)
Index Cond: ((name)::text ~~ '国%'::text)
Buffers: shared hit=4
Planning time: 0.508 ms
Execution time: 2.495 ms
(9 rows)
-- 未使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%国%';
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..11624.33 rows=100 width=16) (actual time=3.167..51.069 rows=232 loops=1)
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=19 read=1978
-> Parallel Seq Scan on jb_test (cost=0.00..10614.33 rows=42 width=16) (actual time=1.609..44.620 rows=77 loops=3)
Filter: ((name)::text ~~ '%国%'::text)
Rows Removed by Filter: 333256
Buffers: shared hit=40 read=5366
Planning time: 0.073 ms
Execution time: 53.692 ms
(10 rows)
模糊查询使用索引(pg_trgm)的要求
pg_trgm前后模糊字符个数要求:
- 有前缀的模糊查询,例如a%,至少需要提供1个字符。( 搜索的是token=’ a’ )
- 有后缀的模糊查询,例如%ab,至少需要提供2个字符。( 搜索的是token=’ab ‘ )
- 前后模糊查询,例如%abcd%,至少需要提供3个字符。( 这个使用数组搜索,搜索的是token(s) 包含 {“ a”,” ab”,abc,bcd,”cd “} )
- 因为pg_trgm生成的TOKEN是三个字符,只有在以上三个条件下,才能匹配到对应的TOKEN
test=# select show_trgm('abc');
show_trgm
-------------------------
{" a"," ab",abc,"bc "}
(1 row)
再次使用pg_trgm索引
-- 使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%毗娖蝯%';
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on jb_test (cost=20.77..381.22 rows=100 width=16) (actual time=0.036..0.036 rows=1 loops=1)
Recheck Cond: ((name)::text ~~ '%毗娖蝯%'::text)
Heap Blocks: exact=1
Buffers: shared hit=3 read=2
-> Bitmap Index Scan on idx_jb_test (cost=0.00..20.75 rows=100 width=0) (actual time=0.031..0.031 rows=1 loops=1)
Index Cond: ((name)::text ~~ '%毗娖蝯%'::text)
Buffers: shared hit=2 read=2
Planning time: 0.053 ms
Execution time: 0.050 ms
(9 rows)
-- 使用到索引
test=# explain (analyze,timing,buffers,costs) select name from jb_test where name like '%毗娖';
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on jb_test (cost=20.77..381.22 rows=100 width=16) (actual time=0.038..0.038 rows=0 loops=1)
Recheck Cond: ((name)::text ~~ '%毗娖'::text)
Buffers: shared hit=2 read=2
-> Bitmap Index Scan on idx_jb_test (cost=0.00..20.75 rows=100 width=0) (actual time=0.036..0.036 rows=0 loops=1)
Index Cond: ((name)::text ~~ '%毗娖'::text)
Buffers: shared hit=2 read=2
Planning time: 0.057 ms
Execution time: 0.055 ms
(8 rows)
解决汉字小于3个字符的情况
- 创建表达式
-- 分割字符函数
create or replace function split001(text) returns text[] as
$$
declare
res text[];
begin
select regexp_split_to_array($1,'') into res;
for i in 1..length($1)-1 loop
res := array_append(res, substring($1,i,2));
end loop;
return res;
end;
$$
language plpgsql strict immutable;
- 生成表达式索引
create index idx_jieba_name on jb_test using gin (split001(name));
- 使用索引
```sql
test=# explain (analyze,buffers,costs,timing) select * from jb_test where split001(name) @>split001(‘孟’);
QUERY PLAN
———————————————————————————————————————- Bitmap Heap Scan on t_trgm (cost=74.63..7588.84 rows=5500 width=15) (actual time=0.060..0.264 rows=276 loops=1) Recheck Cond: (split001(name) @> ‘{孟}’::text[]) Heap Blocks: exact=267 Buffers: shared hit=271 -> Bitmap Index Scan on idx_jieba_name (cost=0.00..73.25 rows=5500 width=0) (actual time=0.035..0.035 rows=276 loops=1) Index Cond: (split001(name) @> ‘{孟}’::text[]) Buffers: shared hit=4 Planning Time: 0.073 ms Execution Time: 0.284 ms (9 rows)
### 使用pg_jieba分词
#### 清空以上数据
```sql
-- 删除表
drop table jb_test ;
-- 创建表
create table jb_test(
name varchar(20) not null default '', --默认文本
name_tsv tsvector); --分词文本
创建分词触发插件
-- 创建触发器
create trigger nametsupdate before insert or update on jb_test for each row execute procedure
tsvector_update_trigger(name_tsv,'public.jiebacfg',name);
-- 如果有旧数据,更新
update jb_test set name_tsv = to_tsvector('jiebacfg',coalesce(name,''));
生成测试数据
-- 生成100万10个汉字
insert into jb_test select gen_hanzi(10) from generate_series(1,1000000);
使用分词查询数据
-- 搜索包含财或者务的文本
test=# explain (analyze,buffers,costs,timing) select name from jb_test where name_tsv @@ '财|务' limit 10;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------
Limit (cost=0.00..527.96 rows=10 width=31) (actual time=0.015..2.003 rows=10 loops=1)
Buffers: shared read=199
-> Seq Scan on jb_test (cost=0.00..35215.00 rows=667 width=31) (actual time=0.015..2.001 rows=10 loops=1)
Filter: (name_tsv @@ '''财'' | ''务'''::tsquery)
Rows Removed by Filter: 8714
Buffers: shared read=199
Planning time: 0.058 ms
Execution time: 2.014 ms
(8 rows)
创建索引
CREATE INDEX idx_jb_test ON jb_test USING GIN (name_tsv tsvector_ops);
使用索引查询
test=# explain (analyze,buffers,costs,timing) select name from jb_test where name_tsv @@ '财|务' limit 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------
Limit (cost=41.17..75.71 rows=10 width=31) (actual time=0.247..0.255 rows=10 loops=1)
Buffers: shared hit=16
-> Bitmap Heap Scan on jb_test (cost=41.17..2345.53 rows=667 width=31) (actual time=0.246..0.254 rows=10 loops=1)
Recheck Cond: (name_tsv @@ '''财'' | ''务'''::tsquery)
Heap Blocks: exact=9
Buffers: shared hit=16
-> Bitmap Index Scan on idx_jb_test (cost=0.00..41.00 rows=667 width=0) (actual time=0.163..0.163 rows=764 loops=1)
Index Cond: (name_tsv @@ '''财'' | ''务'''::tsquery)
Buffers: shared hit=7
Planning time: 0.080 ms
Execution time: 0.280 ms
(11 rows)
注意问题
- pg_jieba每次首次加载数据慢
pg_jieba.so没有放在shared_preload_libraries或session_preload_libraries中 那么每个会话启动时,都需要load pg_jieba.so,从而导致了第一次查询速度非常慢。
解决: ```bash vi postgresql.conf
shared_preload_libraries = ‘pg_jieba.so’
or
session_preload_libraries = ‘pg_jieba.so’
#重启数据库
pg_ctl restart -m fast
```