数据库中该字段存储为 tsvector 类型,插入语句如下:
insert into rum_test(content) values(to_tsvector('simple', '11320272,11254479,11122893,11122893,11188686'));
插入结果:
'11122893':3,4 '11188686':5 '11254479':2 '11320272':1
现在,我用相同的数据去查询,并按相似度排序,得到的结果如下。为什么相似度不是 100%?
select id, content <=> to_tsquery('11320272,11254479,11122893,11122893,11188686') as rank from rum_test
结果:
{"id":10,"rank":3.28987}
你好,tsvector 与 tsquery 的相似度计算用到的是下面这个函数。注意它返回的是"距离",即 1.0 / 得分(得分为 0 时返回无穷大),而不是百分比形式的相似度,因此数值越小表示越相似。
/*
 * rum_ts_distance_tt
 *
 * SQL-callable function taking a tsvector (argument 0) and a tsquery
 * (argument 1).  It scores the pair with calc_score() using the file-level
 * weights and DEF_NORM_METHOD, and returns the reciprocal of that score as
 * a float4.  A score of exactly zero is reported as float4 infinity rather
 * than being divided into 1.0.
 */
Datum
rum_ts_distance_tt(PG_FUNCTION_ARGS)
{
	TSVector	vec = PG_GETARG_TSVECTOR(0);
	TSQuery		qry = PG_GETARG_TSQUERY(1);
	float4		score = calc_score(weights, vec, qry, DEF_NORM_METHOD);

	/* The score is already computed, so the argument copies can be released. */
	PG_FREE_IF_COPY(vec, 0);
	PG_FREE_IF_COPY(qry, 1);

	/* Non-zero score: the result is its reciprocal. */
	if (score != 0)
		PG_RETURN_FLOAT4(1.0 / score);

	/* Zero score: return infinity instead of dividing by zero. */
	PG_RETURN_FLOAT4(get_float4_infinity());
}
/*
 * calc_score
 *
 * Score tsvector "txt" against tsquery "query".
 *
 * arrdata - weight array, handed unchanged to calc_score_docr()
 * txt     - the document vector
 * query   - the query
 * method  - bitmask of RANK_NORM_* flags selecting which normalisation
 *           divisions are applied to the raw score
 *
 * Returns 0.0 when get_docrep() produces no document representation;
 * otherwise the value computed by calc_score_docr(), divided by each
 * normalisation factor selected in "method", narrowed to float4.
 */
static float4
calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method)
{
	QueryRepresentation qr;
	DocRepresentation *docrep;
	uint32		ndoc = 0;
	double		score = 0.0;

	/* Set up the query representation; one zeroed flag slot per query item. */
	qr.query = query;
	qr.map_item_operand = NULL;
	qr.lenght = query->size;
	qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);

	docrep = get_docrep(txt, &qr, &ndoc);
	if (docrep == NULL)
	{
		/* Nothing to score: release the flag array and report zero. */
		pfree(qr.operandexist);
		return 0.0;
	}

	score = calc_score_docr(arrdata, docrep, ndoc, &qr, method);

	/* Divide by log(count_length(txt) + 1), only for a non-empty vector. */
	if (txt->size > 0 && (method & RANK_NORM_LOGLENGTH))
		score /= log((double) (count_length(txt) + 1));

	/* Divide by count_length(txt), skipped when that value is zero. */
	if (method & RANK_NORM_LENGTH)
	{
		uint32		total = count_length(txt);

		if (total > 0)
			score /= (double) total;
	}

	/* Divide by the number of entries in the vector. */
	if (txt->size > 0 && (method & RANK_NORM_UNIQ))
		score /= (double) (txt->size);

	/* Divide by the base-2 logarithm of (number of entries + 1). */
	if (txt->size > 0 && (method & RANK_NORM_LOGUNIQ))
		score /= log((double) (txt->size + 1)) / log(2.0);

	pfree(docrep);
	pfree(qr.operandexist);

	return (float4) score;
}
版权声明:本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。