沧县政府,王爷给本妃笑一个,肖鹰博客
正向最大匹配
# -*- coding:utf-8 -*-
"""Forward maximum matching (FMM) word segmentation.

The text is scanned left to right.  At each position the longest prefix
(at most ``maxlen`` characters) found in the dictionary is emitted as a
word; when no multi-character prefix matches, a single character is emitted.

The module runs unchanged on Python 2 and Python 3.
"""
import io
import logging

codec = 'utf-8'

logger = logging.getLogger(__name__)

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3: str is already the unicode text type


def u(s, encoding):
    """Return *s* as a unicode string, decoding it with *encoding* if needed.

    Args:
        s: A unicode string (returned unchanged) or a byte string.
        encoding: Name of the encoding used to decode a byte string.

    Returns:
        The unicode text of *s*.
    """
    if isinstance(s, _text_type):
        return s
    return _text_type(s, encoding)


def fwd_mm_seg(worddict, maxlen, str):
    """Segment text with forward maximum matching.

    Args:
        worddict: Container of known words; only ``in`` membership is used.
        maxlen: Maximum candidate word length in characters (must be >= 1).
        str: The text to segment.  The name shadows the builtin; it is kept
            so existing keyword callers keep working.

    Returns:
        The list of segments in text order.  Concatenating them reproduces
        the input text.

    Raises:
        ValueError: If *maxlen* is smaller than 1 (a zero or negative window
            would never consume any input).
    """
    if maxlen < 1:
        raise ValueError('maxlen must be at least 1, got %r' % (maxlen,))

    wordlist = []
    segstr = str
    while segstr:
        # Start from the widest allowed window and shrink it until the
        # prefix is a dictionary word or only one character is left.
        wordlen = min(maxlen, len(segstr))
        while wordlen > 1 and segstr[:wordlen] not in worddict:
            wordlen -= 1
        word = segstr[:wordlen]
        logger.debug('segment: %r', word)
        wordlist.append(word)
        segstr = segstr[wordlen:]
    return wordlist


def main():
    """Load ``words.dic`` (one word per line) and segment a sample sentence."""
    # io.open decodes while reading on both Python 2 and 3, and the with
    # block guarantees the file handle is closed.
    with io.open('words.dic', encoding=codec) as fp_dict:
        stripped = (eachword.strip() for eachword in fp_dict)
        worddict = set(word for word in stripped if word)

    segstr = u'你好世界hello world'
    print(segstr)
    wordlist = fwd_mm_seg(worddict, 10, segstr)
    print("==".join(wordlist))


if __name__ == '__main__':
    main()
逆向最大匹配
# -*- coding:utf-8 -*-
"""Backward maximum matching (BMM) word segmentation.

The text is scanned right to left.  At each position the longest suffix
(at most ``maxlen`` characters) found in the dictionary is emitted as a
word; when no multi-character suffix matches, a single character is emitted.

The module runs unchanged on Python 2 and Python 3.
"""
import io
import logging

codec = 'utf-8'

logger = logging.getLogger(__name__)

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3: str is already the unicode text type


def u(s, encoding):
    """Return *s* as a unicode string, decoding it with *encoding* if needed.

    Args:
        s: A unicode string (returned unchanged) or a byte string.
        encoding: Name of the encoding used to decode a byte string.

    Returns:
        The unicode text of *s*.
    """
    if isinstance(s, _text_type):
        return s
    return _text_type(s, encoding)


def bwd_mm_seg(worddict, maxlen, str):
    """Segment text with backward maximum matching.

    Args:
        worddict: Container of known words; only ``in`` membership is used.
        maxlen: Maximum candidate word length in characters (must be >= 1).
        str: The text to segment.  The name shadows the builtin; it is kept
            so existing keyword callers keep working.

    Returns:
        The list of segments in text order (the right-to-left matches are
        reversed before returning).  Concatenating them reproduces the
        input text.

    Raises:
        ValueError: If *maxlen* is smaller than 1 (a zero or negative window
            would never consume any input).
    """
    if maxlen < 1:
        raise ValueError('maxlen must be at least 1, got %r' % (maxlen,))

    wordlist = []
    segstr = str
    while segstr:
        # Start from the widest allowed window and shrink it until the
        # suffix is a dictionary word or only one character is left.
        # wordlen never drops below 1, so the negative slices are safe.
        wordlen = min(maxlen, len(segstr))
        while wordlen > 1 and segstr[-wordlen:] not in worddict:
            wordlen -= 1
        word = segstr[-wordlen:]
        logger.debug('segment: %r', word)
        wordlist.append(word)
        segstr = segstr[:-wordlen]

    # Matches were collected from the end of the text; restore text order.
    wordlist.reverse()
    return wordlist


def main():
    """Load ``words.dic`` (one word per line) and segment a sample sentence."""
    # io.open decodes while reading on both Python 2 and 3, and the with
    # block guarantees the file handle is closed.
    with io.open('words.dic', encoding=codec) as fp_dict:
        stripped = (eachword.strip() for eachword in fp_dict)
        worddict = set(word for word in stripped if word)

    segstr = u'你好世界hello world'
    print(segstr)
    wordlist = bwd_mm_seg(worddict, 10, segstr)
    print("==".join(wordlist))


if __name__ == '__main__':
    main()
以上这篇 Python 正向最大匹配分词和逆向最大匹配分词的实例,就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持移动技术网。
如对本文有疑问,请在下面留言讨论,广大热心网友会与你互动!点击进行留言回复
Python 实现将numpy中的nan和inf,nan替换成对应的均值
python爬虫把url链接编码成gbk2312格式过程解析
网友评论