Open Chinese Convert  1.1.1
A project for conversion between Traditional and Simplified Chinese
PhraseExtract.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #include <functional>
22 #include <unordered_map>
23 
24 #include "Common.hpp"
25 #include "UTF8StringSlice.hpp"
26 
27 namespace opencc {
28 
29 class OPENCC_EXPORT PhraseExtract {
30 public:
31  typedef UTF8StringSlice::LengthType LengthType;
32 
34 
35  PhraseExtract();
36 
37  virtual ~PhraseExtract();
38 
39  void Extract(const std::string& text) {
40  SetFullText(text);
41  ExtractSuffixes();
42  CalculateFrequency();
43  CalculateSuffixEntropy();
44  ReleaseSuffixes();
45  ExtractPrefixes();
46  CalculatePrefixEntropy();
47  ReleasePrefixes();
48  ExtractWordCandidates();
49  CalculateCohesions();
50  SelectWords();
51  }
52 
53  void SetFullText(const std::string& fullText) {
54  utf8FullText = UTF8StringSlice(fullText.c_str());
55  }
56 
57  void SetFullText(const char* fullText) {
58  utf8FullText = UTF8StringSlice(fullText);
59  }
60 
61  void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
62 
63  void SetWordMinLength(const LengthType _wordMinLength) {
64  wordMinLength = _wordMinLength;
65  }
66 
67  void SetWordMaxLength(const LengthType _wordMaxLength) {
68  wordMaxLength = _wordMaxLength;
69  }
70 
71  void SetPrefixSetLength(const LengthType _prefixSetLength) {
72  prefixSetLength = _prefixSetLength;
73  }
74 
75  void SetSuffixSetLength(const LengthType _suffixSetLength) {
76  suffixSetLength = _suffixSetLength;
77  }
78 
79  // PreCalculationFilter is called after frequencies statistics.
80  void SetPreCalculationFilter(
81  const std::function<bool(const PhraseExtract&,
82  const UTF8StringSlice8Bit&)>& filter) {
83  preCalculationFilter = filter;
84  }
85 
86  void SetPostCalculationFilter(
87  const std::function<bool(const PhraseExtract&,
88  const UTF8StringSlice8Bit&)>& filter) {
89  postCalculationFilter = filter;
90  }
91 
92  void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
93 
94  void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
95 
96  const std::vector<UTF8StringSlice8Bit>& Words() const { return words; }
97 
98  const std::vector<UTF8StringSlice8Bit>& WordCandidates() const {
99  return wordCandidates;
100  }
101 
102  struct Signals {
103  size_t frequency;
104  double cohesion;
105  double suffixEntropy;
106  double prefixEntropy;
107  };
108 
109  const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
110 
111  double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
112 
113  double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
114 
115  double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
116 
117  double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
118 
119  size_t Frequency(const UTF8StringSlice8Bit& word) const;
120 
121  double Probability(const UTF8StringSlice8Bit& word) const;
122 
123  double LogProbability(const UTF8StringSlice8Bit& word) const;
124 
125  void Reset();
126 
127  void ExtractSuffixes();
128 
129  void ExtractPrefixes();
130 
131  void ExtractWordCandidates();
132 
133  void CalculateFrequency();
134 
135  void CalculateCohesions();
136 
137  void CalculateSuffixEntropy();
138 
139  void CalculatePrefixEntropy();
140 
141  void SelectWords();
142 
143  static bool
144  DefaultPreCalculationFilter(const PhraseExtract&,
146 
147  static bool
148  DefaultPostCalculationFilter(const PhraseExtract&,
150 
151 private:
152  class DictType;
153 
154  // Pointwise Mutual Information
155  double PMI(const UTF8StringSlice8Bit& wordCandidate,
156  const UTF8StringSlice8Bit& part1,
157  const UTF8StringSlice8Bit& part2) const;
158 
159  double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
160 
161  double CalculateEntropy(
162  const std::unordered_map<UTF8StringSlice8Bit, size_t,
163  UTF8StringSlice8Bit::Hasher>& choices) const;
164 
165  LengthType wordMinLength;
166  LengthType wordMaxLength;
167  LengthType prefixSetLength;
168  LengthType suffixSetLength;
169  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
170  preCalculationFilter;
171  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
172  postCalculationFilter;
173 
174  bool prefixesExtracted;
175  bool suffixesExtracted;
176  bool frequenciesCalculated;
177  bool wordCandidatesExtracted;
178  bool cohesionsCalculated;
179  bool prefixEntropiesCalculated;
180  bool suffixEntropiesCalculated;
181  bool wordsSelected;
182 
183  UTF8StringSlice utf8FullText;
184  size_t totalOccurrence;
185  double logTotalOccurrence;
186  std::vector<UTF8StringSlice8Bit> prefixes;
187  std::vector<UTF8StringSlice8Bit> suffixes;
188  std::vector<UTF8StringSlice8Bit> wordCandidates;
189  std::vector<UTF8StringSlice8Bit> words;
190  DictType* signals;
191 
192  friend class PhraseExtractTest;
193 };
194 
195 } // namespace opencc
opencc::UTF8StringSliceBase::Hasher
Definition: UTF8StringSlice.hpp:202
opencc::PhraseExtract::Signals
Definition: PhraseExtract.hpp:102
opencc::UTF8StringSliceBase
Definition: UTF8StringSlice.hpp:54
opencc::PhraseExtract
Definition: PhraseExtract.hpp:29