35 "SmoothedNgramPredictor, a linear interpolating n-gram predictor",
36 "SmoothedNgramPredictor, long description." ),
39 learn_mode_set (false),
81 std::stringstream ss_deltas(value);
84 while (ss_deltas >> delta) {
85 logger << DEBUG <<
"Pushing delta: " << delta <<
endl;
142 const char separator[] =
"|";
143 std::string result = separator;
145 for (Ngram::const_iterator it = ngram.begin();
149 result += *it + separator;
173 unsigned int result = 0;
176 assert(ngram_size >= 0);
178 if (ngram_size > 0) {
179 Ngram ngram(ngram_size);
180 copy(tokens.end() - ngram_size + offset , tokens.end() + offset, ngram.begin());
185 logger << DEBUG <<
"unigram counts sum: " << result <<
endl;
205 logger << DEBUG <<
"Cached tokens[" << cardinality - 1 - i <<
"] = " << tokens[cardinality - 1 - i] <<
endl;
220 std::vector<std::string> prefixCompletionCandidates;
221 for (
size_t k = cardinality; (k > 0 && prefixCompletionCandidates.size() < max_partial_prediction_size); k--) {
222 logger << DEBUG <<
"Building partial prefix completion table of cardinality: " << k <<
endl;
224 Ngram prefix_ngram(k);
225 copy(tokens.end() - k, tokens.end(), prefix_ngram.begin());
228 logger << DEBUG <<
"prefix_ngram: ";
229 for (
size_t r = 0; r < prefix_ngram.size(); r++) {
230 logger << DEBUG << prefix_ngram[r] <<
' ';
241 partial =
db->
getNgramLikeTable(prefix_ngram,max_partial_prediction_size - prefixCompletionCandidates.size());
249 logger << DEBUG <<
"partial prefixCompletionCandidates" << endl
250 << DEBUG <<
"----------------------------------" <<
endl;
251 for (
size_t j = 0; j < partial.size(); j++) {
252 for (
size_t k = 0; k < partial[j].size(); k++) {
253 logger << DEBUG << partial[j][k] <<
" ";
259 logger << DEBUG <<
"Partial prefix completion table contains " << partial.size() <<
" potential completions." <<
endl;
265 std::vector<Ngram>::const_iterator it = partial.begin();
266 while (it != partial.end() && prefixCompletionCandidates.size() < max_partial_prediction_size) {
270 std::string candidate = *(it->end() - 2);
271 if (find(prefixCompletionCandidates.begin(),
272 prefixCompletionCandidates.end(),
273 candidate) == prefixCompletionCandidates.end()) {
274 prefixCompletionCandidates.push_back(candidate);
281 logger << DEBUG <<
"prefixCompletionCandidates" << endl
282 << DEBUG <<
"--------------------------" <<
endl;
283 for (
size_t j = 0; j < prefixCompletionCandidates.size(); j++) {
284 logger << DEBUG << prefixCompletionCandidates[j] <<
endl;
294 for (
size_t j = 0; (j < prefixCompletionCandidates.size() && j < max_partial_prediction_size); j++) {
296 tokens[cardinality - 1] = prefixCompletionCandidates[j];
298 logger << DEBUG <<
"------------------" <<
endl;
299 logger << DEBUG <<
"w_i: " << tokens[cardinality - 1] <<
endl;
301 double probability = 0;
303 double numerator =
count(tokens, 0, k+1);
305 double denominator = (k == 0 ? unigrams_counts_sum :
count(tokens, -1, k));
306 double frequency = ((denominator > 0) ? (numerator / denominator) : 0);
307 probability +=
deltas[k] * frequency;
309 logger << DEBUG <<
"numerator: " << numerator <<
endl;
310 logger << DEBUG <<
"denominator: " << denominator <<
endl;
311 logger << DEBUG <<
"frequency: " << frequency <<
endl;
315 assert(numerator <= denominator);
316 assert(frequency <= 1);
320 logger << DEBUG <<
"probability: " << probability <<
endl;
322 if (probability > 0) {
346 for (
size_t curr_cardinality = 1;
348 curr_cardinality++) {
350 logger << DEBUG <<
"Learning for n-gram cardinality: " << curr_cardinality <<
endl;
353 for (std::vector<std::string>::const_reverse_iterator idx = change.rbegin();
354 idx != change.rend();
361 for (std::vector<std::string>::const_reverse_iterator inner_idx = idx;
362 inner_idx != change.rend() && ngram.size() < curr_cardinality;
365 ngram.insert(ngram.begin(), *inner_idx);
372 ngram.size() < curr_cardinality;
380 logger << DEBUG <<
"Adding extra token: " << extra_token <<
endl;
381 ngram.insert(ngram.begin(), extra_token);
385 logger << INFO <<
"Considering to learn ngram: |";
386 for (
size_t j = 0; j < ngram.size(); j++) {
387 logger << INFO << ngram[j] <<
'|';
391 if (ngram.end() == find(ngram.begin(), ngram.end(),
"")) {
403 logger << INFO <<
"Committed learning update to database" <<
endl;
408 logger << ERROR <<
"Rolling back learning update : " << ex.
what() <<
endl;
424 size_t size = ngram.size();
425 for (
size_t i = 0; i < size; i++) {
426 if (
count(ngram, -i, size - i) >
count(ngram, -(i + 1), size - (i + 1))) {
427 logger << INFO <<
"consistency adjustment needed!" <<
endl;
429 int offset = -(i + 1);
430 int sub_ngram_size = size - (i + 1);
432 logger << DEBUG <<
"i: " << i <<
" | offset: " << offset <<
" | sub_ngram_size: " << sub_ngram_size <<
endl;
434 Ngram sub_ngram(sub_ngram_size);
435 copy(ngram.end() - sub_ngram_size + offset, ngram.end() + offset, sub_ngram.begin());
438 logger <<
"ngram to be count adjusted is: ";
439 for (
size_t i = 0; i < sub_ngram.size(); i++) {
440 logger << sub_ngram[i] <<
' ';
446 logger << DEBUG <<
"consistency adjusted" <<
endl;
void dispatch(const Observable *var)
virtual void beginTransaction() const
~SmoothedNgramPredictor()
void check_learn_consistency(const Ngram &name) const
Dispatcher< SmoothedNgramPredictor > dispatcher
NgramTable getNgramLikeTableFiltered(const Ngram ngram, const char **filter, int limit=-1) const
int getUnigramCountsSum() const
Variable * find(const std::string &variable) const
std::string getExtraTokenToLearn(const int index, const std::vector< std::string > &change) const
virtual void learn(const std::vector< std::string > &change)
int getNgramCount(const Ngram ngram) const
void set_database_logger_level(const std::string &level)
virtual Prediction predict(const size_t size, const char **filter) const
Generate prediction.
virtual void set_logger(const std::string &level)
unsigned int count(const std::vector< std::string > &tokens, int offset, int ngram_size) const
Builds the required n-gram and returns its count.
void set_deltas(const std::string &deltas)
const std::string PREDICTORS
static std::string ngram_to_string(const Ngram &ngram)
virtual void endTransaction() const
std::vector< double > deltas
void init_database_connector_if_ready()
static double toDouble(const std::string)
virtual void update(const Observable *variable)
std::vector< Ngram > NgramTable
void set_learn(const std::string &learn_mode)
virtual std::string get_name() const =0
SmoothedNgramPredictor(Configuration *, ContextTracker *, const char *)
void map(Observable *var, const mbr_func_ptr_t &ptr)
ContextTracker * contextTracker
std::string getToken(const int) const
void addSuggestion(Suggestion)
std::string DATABASE_LOGGER
Tracks user interaction and context.
static bool isTrue(const char *)
virtual std::string get_value() const =0
virtual void rollbackTransaction() const
virtual const char * what() const
void set_dbfilename(const std::string &filename)
int incrementNgramCount(const Ngram ngram) const
NgramTable getNgramLikeTable(const Ngram ngram, int limit=-1) const
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)