# Score a dictionary-based segmenter on the MSR portion of the SIGHAN 2005
# bakeoff data: segment every line, write the result to a file, then report
# the metrics computed by prf().

# Root directory of the corpus; every path below is built relative to it.
# NOTE(review): ensure_data presumably fetches/unpacks the archive if it is
# missing and returns the local directory -- confirm against its definition.
sighan05 = ensure_data(
    'icwb2-data',
    'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip',
)
msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
# msr_test is defined here but is not read anywhere in this section.
msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
msr_output = os.path.join(sighan05, 'testing', 'msr_output.txt')
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')

# Java segmenter class resolved by name, constructed from the MSR word list.
DoubleArrayTrieSegment = JClass('com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment')
segment = DoubleArrayTrieSegment([msr_dict]).enablePartOfSpeechTagging(True)

# Compiled once up front; applied to every input line below.
whitespace = re.compile("\\s+")

# The gold file doubles as the input: all whitespace (including the trailing
# newline) is removed from each line before it is handed to the segmenter,
# and exactly one output line is written per gold line.
with open(msr_gold, encoding='utf-8') as gold_file, \
        open(msr_output, 'w', encoding='utf-8') as out_file:
    for gold_line in gold_file:
        raw_text = whitespace.sub("", gold_line)
        words = [term.word for term in segment.seg(raw_text)]
        out_file.write(" ".join(words) + "\n")

# prf() must yield five numbers, matching the five placeholders below.
# NOTE(review): segment.trie is presumably the dictionary used to separate
# in-vocabulary from out-of-vocabulary words -- confirm against prf().
scores = prf(msr_gold, msr_output, segment.trie)
print("P:%.2f R:%.2f F1:%.2f OOV-R:%.2f IV-R:%.2f" % scores)