当前位置：移动技术网 > IT编程>开发语言>.net > .NET下文本相似度算法余弦定理和SimHash浅析及应用

.NET下文本相似度算法余弦定理和SimHash浅析及应用

2018年09月13日 | 移动技术网IT编程 | 我要评论

电视剧真假千金,地产女老板全集下载,liujo

余弦相似性

原理：首先把两段文本分词，列出所有出现的词语；其次计算每个词语的词频；最后把词频转换为向量，这样就只需要计算两个向量的相似程度。

我们简单表述如下

文本1：我/爱/北京/天安门/ 经过分词求词频得出向量 [1,1,1,1,0,0]

文本2：我们/都爱/北京/天安门/ 经过分词求词频得出向量 [0,0,1,1,1,1]（两个向量的各维依次对应两段文本的全部词语：我、爱、北京、天安门、我们、都爱）

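我们可以把它们想象成空间中的两条线段，都是从原点（[0, 0, ...]）出发，指向不同的方向。两条线段之间形成一个夹角，如果夹角为0度，意味着方向相同、线段重合；如果夹角为90度，意味着形成直角，方向完全不相似；如果夹角为180度，意味着方向正好相反。因此，我们可以通过夹角的大小，来判断向量的相似程度。夹角越小，就代表越相似。夹角的余弦值可由 $\cos\theta = \dfrac{A \cdot B}{\lVert A \rVert\,\lVert B \rVert}$ 计算：余弦值越接近1，夹角越接近0度，两个向量越相似。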

C# 核心算法

public class tfidfmeasure

{

// Source documents, stored by the constructor.
private string[] _docs;

// Allocated in generateterms with one slot per document; the code in this
// listing never fills the slots.
private string[][] _ngramdoc;

// Number of documents (length of _docs).
private int _numdocs=0;

// Number of distinct terms (count of _terms).
private int _numterms=0;

// Distinct terms returned by generateterms; a term's position is its index.
private ArrayList _terms;

// _termfreq[term][doc]: how often the term occurs in the document.
private int[][] _termfreq;

// _termweight[term][doc]: value of computetermweight(term, doc).
private float[][] _termweight;

// _maxtermfreq[doc]: largest single-term frequency found in the document.
private int[] _maxtermfreq;

// _docfreq[term]: number of documents in which the term occurs.
private int[] _docfreq;

// Static helpers that treat float arrays as vectors.
public class termvector
{
    // Returns the cosine of the angle between vector1 and vector2.
    // Returns 0 when either vector has zero length, because the cosine is
    // undefined there. Throws Exception when the arrays differ in length.
    public static float computecosinesimilarity(float[] vector1, float[] vector2)
    {
        if (vector1.Length != vector2.Length)
            throw new Exception("difer length");

        float denom = vectorlength(vector1) * vectorlength(vector2);
        if (denom == 0f)
            return 0f;

        return innerproduct(vector1, vector2) / denom;
    }

    // Returns the dot product of vector1 and vector2.
    // Throws Exception when the arrays differ in length.
    public static float innerproduct(float[] vector1, float[] vector2)
    {
        if (vector1.Length != vector2.Length)
            throw new Exception("differ length are not allowed");

        float result = 0f;
        for (int i = 0; i < vector1.Length; i++)
            result += vector1[i] * vector2[i];

        return result;
    }

    // Returns the Euclidean length of vector (square root of the sum of squares).
    public static float vectorlength(float[] vector)
    {
        float sum = 0.0f;
        for (int i = 0; i < vector.Length; i++)
            sum += vector[i] * vector[i];

        return (float) Math.Sqrt(sum);
    }
}

// Maps each term to its index in _terms; filled by myinit, read by gettermindex.
private IDictionary _wordsindex = new Hashtable();

// Stores the documents and runs myinit() to build the vocabulary,
// the term frequencies and the term weights.
// Throws ArgumentNullException when documents is null.
public tfidfmeasure(string[] documents)
{
    if (documents == null)
        throw new ArgumentNullException("documents");

    _docs = documents;
    _numdocs = documents.Length;
    myinit();
}

// Placeholder with an empty body; no call to it appears in this listing.
private void generatngramtext() { }

// Tokenises every document and returns the distinct words in order of
// first appearance. Also (re)allocates _ngramdoc with one slot per document.
private ArrayList generateterms(string[] docs)
{
    ArrayList uniques = new ArrayList();
    _ngramdoc = new string[_numdocs][];

    for (int i = 0; i < docs.Length; i++)
    {
        tokeniser tokenizer = new tokeniser();

        // Lower-case before tokenising, exactly as getwordfrequency does.
        // Otherwise a word containing upper-case letters is stored here in
        // one form and looked up in another, gettermindex returns -1 and the
        // frequency table is indexed out of range.
        string[] words = tokenizer.partition(docs[i].ToLower());

        for (int j = 0; j < words.Length; j++)
        {
            if (!uniques.Contains(words[j]))
                uniques.Add(words[j]);
        }
    }

    return uniques;
}

// Stores newvalue under key in collection and returns the value that was
// stored under key before the call (whatever the indexer returned).
private static object addelement(IDictionary collection, object key, object newvalue)
{
    object element = collection[key];
    collection[key] = newvalue;
    return element;
}

// Looks term up in _wordsindex and returns the stored index,
// or -1 when the lookup yields null.
private int gettermindex(string term)
{
    object stored = _wordsindex[term];
    return stored == null ? -1 : (int) stored;
}

// Builds all model state from _docs: the vocabulary, the per-term tables,
// the term -> index map, then the frequencies and finally the weights.
private void myinit()
{
    _terms = generateterms(_docs);
    _numterms = _terms.Count;

    _maxtermfreq = new int[_numdocs];
    _docfreq = new int[_numterms];
    _termfreq = new int[_numterms][];
    _termweight = new float[_numterms][];

    // One row per term, one column per document.
    for (int i = 0; i < _terms.Count; i++)
    {
        _termweight[i] = new float[_numdocs];
        _termfreq[i] = new int[_numdocs];
        addelement(_wordsindex, _terms[i], i);
    }

    // Weights are computed from the frequencies, so the order matters.
    generatetermfrequency();
    generatetermweight();
}

// Returns the natural logarithm (base e) of num as a float.
// Note: this is not log2. One-argument Math.Log is base e; for the IDF
// value computed from it, the base only changes a constant scale factor.
private float log(float num)
{
    return (float) Math.Log(num);
}

// Fills _termfreq, _docfreq and _maxtermfreq from the word counts of
// every document.
private void generatetermfrequency()
{
    for (int i = 0; i < _numdocs; i++)
    {
        string curdoc = _docs[i];
        IDictionary freq = getwordfrequency(curdoc);
        IDictionaryEnumerator enums = freq.GetEnumerator();

        // Stays at int.MinValue when the document yields no words.
        _maxtermfreq[i] = int.MinValue;

        while (enums.MoveNext())
        {
            string word = (string) enums.Key;
            int wordfreq = (int) enums.Value;

            // Relies on every counted word being in the vocabulary:
            // gettermindex returns -1 for an unknown word, which would
            // index out of range below.
            int termindex = gettermindex(word);

            _termfreq[termindex][i] = wordfreq;
            _docfreq[termindex]++;   // each distinct word is seen once per document

            if (wordfreq > _maxtermfreq[i])
                _maxtermfreq[i] = wordfreq;
        }
    }
}

// Fills _termweight[term][doc] with computetermweight(term, doc)
// for every term/document pair.
private void generatetermweight()
{
    for (int i = 0; i < _numterms; i++)
    {
        for (int j = 0; j < _numdocs; j++)
            _termweight[i][j] = computetermweight(i, j);
    }
}

// Returns the term's count in the document divided by the largest
// single-term count of that document, as a float.
private float gettermfrequency(int term, int doc)
{
    float occurrences = _termfreq[term][doc];
    float highest = _maxtermfreq[doc];
    return occurrences / highest;
}

// Returns log(_numdocs / _docfreq[term]), computed in float arithmetic.
private float getinversedocumentfrequency(int term)
{
    float containing = _docfreq[term];
    float total = _numdocs;
    return log(total / containing);
}

// Returns the TF-IDF weight of the term in the document:
// gettermfrequency(term, doc) times getinversedocumentfrequency(term).
private float computetermweight(int term, int doc)
{
    return gettermfrequency(term, doc) * getinversedocumentfrequency(term);
}

// Returns a new array holding the weight of every term for the given
// document, i.e. that document's column of _termweight.
private float[] gettermvector(int doc)
{
    float[] column = new float[_numterms];

    int term = 0;
    while (term < _numterms)
    {
        column[term] = _termweight[term][doc];
        term++;
    }

    return column;
}

// Returns the cosine similarity between the term-weight vectors of
// documents doc_i and doc_j.
public float getsimilarity(int doc_i, int doc_j)
{
    return termvector.computecosinesimilarity(
        gettermvector(doc_i),
        gettermvector(doc_j));
}

// Returns a dictionary mapping each distinct word of input (lower-cased,
// then tokenised) to the number of times it occurs.
private IDictionary getwordfrequency(string input)
{
    string convertedinput = input.ToLower();

    tokeniser tokenizer = new tokeniser();
    string[] words = tokenizer.partition(convertedinput);

    // countwords binary-searches this array, so it must be sorted first.
    Array.Sort(words);

    string[] distinctwords = getdistinctwords(words);

    IDictionary result = new Hashtable();
    for (int i = 0; i < distinctwords.Length; i++)
        result[distinctwords[i]] = countwords(distinctwords[i], words);

    return result;
}

// Returns the distinct entries of input in order of first appearance,
// or an empty array when input is null.
private string[] getdistinctwords(string[] input)
{
    if (input == null)
        return new string[0];

    ArrayList list = new ArrayList();
    for (int i = 0; i < input.Length; i++)
    {
        if (!list.Contains(input[i])) // n-gram similarity?
            list.Add(input[i]);
    }

    return tokeniser.arraylisttoarray(list);
}

// Returns how many entries of words equal word.
// words must already be sorted: the run of matches is located with a
// binary search, and counting stops at the first non-match after it.
private int countwords(string word, string[] words)
{
    // Negative when word is absent; both loops below are then skipped.
    int itemidx = Array.BinarySearch(words, word);

    // The search may land anywhere inside a run of equal entries, so walk
    // back to index 0 or to the entry just before the run.
    if (itemidx > 0)
        while (itemidx > 0 && words[itemidx].Equals(word))
            itemidx--;

    int count = 0;
    while (itemidx < words.Length && itemidx >= 0)
    {
        if (words[itemidx].Equals(word)) count++;

        itemidx++;

        // Stop as soon as the next entry is no longer a match.
        if (itemidx < words.Length)
            if (!words[itemidx].Equals(word)) break;
    }

    return count;
}

缺点

由于一篇文章的特征词可能特别多，导致整个向量的维度很高，计算代价太大，因此不适合大数据量的计算。

SimHash

原理

算法的主要思想是降维，将高维的特征向量映射成一个f-bit的指纹(fingerprint)，通过比较两篇文章的f-bit指纹的hamming distance来确定文章是否重复或者高度近似。由于每篇文章的指纹都可以事先计算好并保存，比较时只需要计算两个指纹之间的hamming distance，所以速度非常快，适合大数据计算。

Google就是基于此算法实现网页文件查重的。我们假设有以下三段文本：

1，the cat sat on the mat

2，the cat sat on a mat

3，we all scream for ice cream

如何实现这种hash算法呢？以上述三个文本为例，整个过程可以分为以下六步：

1、选择simhash的位数，请综合考虑存储成本以及数据集的大小，比如说32位

2、将simhash的各位初始化为0

3、提取原始文本中的特征，一般采用各种分词的方式。比如对于"the cat sat on the mat"，采用两两分词的方式得到如下结果：{"th", "he", "e ", " c", "ca", "at", "t ", " s", "sa", " o", "on", "n ", " t", " m", "ma"}

4、使用传统的32位hash函数计算各个word的hashcode，比如："th".hash = -502157718，"he".hash = -369049682，……

5、对各word的hashcode的每一位，如果该位为1，则simhash相应位的值加1；否则减1

6、对最后得到的32位的simhash，如果该位大于0，则设为1；否则设为0

您可能感兴趣的文章:

如对本文有疑问，请在下面进行留言讨论，广大热心网友会与你互动！！点击进行留言回复

Blazor server side 自家的一些开源的, 实用型项目的进度之 CEF客户端

距离上次提出 [Asp.Net Core] Blazor Server Side 扩展用途 - 配合CEF来制作带浏览器核心的客户端软件的想法后,&#... [阅读全文]
武装你的WEBAPI-OData入门

本文属于OData系列目录（可能会有后续修改） "武装你的WEBAPI OData入门" 武装你的WEBAPI OData便捷查询武装你的WEBAP... [阅读全文]
.NET IoC模式依赖反转(DIP)、控制反转(Ioc)、依赖注入(DI)

依赖倒置原则(DIP) 依赖倒置(Dependency Inversion Principle,缩写DIP)是面向对象六大基本原则之一。他是指一种特定的... [阅读全文]
DevExpress+Winform（四）

视频：https://www.bilibili.com/video/BV15x411x7WN?p=5 新建Devexpress Winform Blan... [阅读全文]
Jenkins之Nunit的应用

一、在Jenkins中安装Nunit插件进入jenkins的插件管理模块，下载Nunit插件。此步骤不做截图说明二、引用nunit.console的nu... [阅读全文]
vue+.netcore可支持业务代码扩展的开发框架 VOL.Vue 2.0版本发布

框架介绍这是一个基于vue、element-ui、iview、.netcore3.1 可支持前端、后台动态扩展业务代码快速开发框架。框架内置定制开发... [阅读全文]
微信退款（在.net core 用http方式请求）

微信JSAPI支付申请退款接口地址接口链接：https://api.mch.weixin.qq.com/secapi/pay/refund 是否需... [阅读全文]
Owin Katana 的底层源码分析

最近看了一下开源项目asp.net katana，感觉公开的接口非常的简洁优雅，channel 9 说是受到node.js的启发设计的，Katana是一... [阅读全文]
jenkins发布application且并运行

一、发布配置差异配置：编译内容编译目标NetWorkClient/KJ90NetClient.csproj编译命令/t:build/p:Configur... [阅读全文]
WPF 简易日期控件魔改ListBox

先上截图修正：应该将SetTime方法修改为，行号为207行开始修改 var nk = Day_of_week(year, month, 1); i... [阅读全文]

网友评论


验证码：

.NET下文本相似度算法余弦定理和SimHash浅析及应用

2018年09月13日 | 移动技术网IT编程 | 我要评论

您可能感兴趣的文章:

相关文章:

网友评论