SHOGUN  6.1.3
CommWordStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #include <shogun/base/Parameter.h>
12 #include <shogun/base/progress.h>
13 #include <shogun/io/SGIO.h>
14 #include <shogun/lib/common.h>
15 
18 
20 
21 using namespace shogun;
22 
24 : CStringKernel<uint16_t>()
25 {
26  init();
27 }
28 
30 : CStringKernel<uint16_t>(size)
31 {
32  init();
33  use_sign=s;
34 }
35 
38  bool s, int32_t size) : CStringKernel<uint16_t>(size)
39 {
40  init();
41  use_sign=s;
42 
43  init(l,r);
44 }
45 
46 
48 {
50  SG_DEBUG("using dictionary of %d words\n", size)
51  clear_normal();
52 
53  return dictionary_weights.vector!=NULL;
54 }
55 
57 {
58  cleanup();
59 
61 }
62 
63 bool CCommWordStringKernel::init(CFeatures* l, CFeatures* r)
64 {
66 
68  {
70  dict_diagonal_optimization=SG_MALLOC(int32_t, int32_t(((CStringFeatures<uint16_t>*)l)->get_num_symbols()));
71  ASSERT(((CStringFeatures<uint16_t>*)l)->get_num_symbols() == ((CStringFeatures<uint16_t>*)r)->get_num_symbols())
72  }
73 
74  return init_normalizer();
75 }
76 
78 {
81 }
82 
84 {
85  int32_t alen;
88 
89  bool free_av;
90  uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);
91 
92  float64_t result=0.0 ;
93  ASSERT(l==r)
94  ASSERT(sizeof(uint16_t)<=sizeof(float64_t))
95  ASSERT((1<<(sizeof(uint16_t)*8)) > alen)
96 
97  int32_t num_symbols=(int32_t) l->get_num_symbols();
98  ASSERT(num_symbols<=dictionary_weights.vlen)
99 
100  int32_t* dic = dict_diagonal_optimization;
101  memset(dic, 0, num_symbols*sizeof(int32_t));
102 
103  for (int32_t i=0; i<alen; i++)
104  dic[av[i]]++;
105 
106  if (use_sign)
107  {
108  for (int32_t i=0; i<(int32_t) l->get_num_symbols(); i++)
109  {
110  if (dic[i]!=0)
111  result++;
112  }
113  }
114  else
115  {
116  for (int32_t i=0; i<num_symbols; i++)
117  {
118  if (dic[i]!=0)
119  result+=dic[i]*dic[i];
120  }
121  }
122  l->free_feature_vector(av, idx_a, free_av);
123 
124  return result;
125 }
126 
128  int32_t idx_a, int32_t idx_b, bool do_sort)
129 {
130  int32_t alen, blen;
131  bool free_av, free_bv;
132 
135 
136  uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);
137  uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bv);
138 
139  uint16_t* avec=av;
140  uint16_t* bvec=bv;
141 
142  if (do_sort)
143  {
144  if (alen>0)
145  {
146  avec=SG_MALLOC(uint16_t, alen);
147  sg_memcpy(avec, av, sizeof(uint16_t)*alen);
148  CMath::radix_sort(avec, alen);
149  }
150  else
151  avec=NULL;
152 
153  if (blen>0)
154  {
155  bvec=SG_MALLOC(uint16_t, blen);
156  sg_memcpy(bvec, bv, sizeof(uint16_t)*blen);
157  CMath::radix_sort(bvec, blen);
158  }
159  else
160  bvec=NULL;
161  }
162  else
163  {
164  if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) ||
166  {
167  SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
168  " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(),
170  }
171  }
172 
173  float64_t result=0;
174 
175  int32_t left_idx=0;
176  int32_t right_idx=0;
177 
178  if (use_sign)
179  {
180  while (left_idx < alen && right_idx < blen)
181  {
182  if (avec[left_idx]==bvec[right_idx])
183  {
184  uint16_t sym=avec[left_idx];
185 
186  while (left_idx< alen && avec[left_idx]==sym)
187  left_idx++;
188 
189  while (right_idx< blen && bvec[right_idx]==sym)
190  right_idx++;
191 
192  result++;
193  }
194  else if (avec[left_idx]<bvec[right_idx])
195  left_idx++;
196  else
197  right_idx++;
198  }
199  }
200  else
201  {
202  while (left_idx < alen && right_idx < blen)
203  {
204  if (avec[left_idx]==bvec[right_idx])
205  {
206  int32_t old_left_idx=left_idx;
207  int32_t old_right_idx=right_idx;
208 
209  uint16_t sym=avec[left_idx];
210 
211  while (left_idx< alen && avec[left_idx]==sym)
212  left_idx++;
213 
214  while (right_idx< blen && bvec[right_idx]==sym)
215  right_idx++;
216 
217  result+=((float64_t) (left_idx-old_left_idx))*
218  ((float64_t) (right_idx-old_right_idx));
219  }
220  else if (avec[left_idx]<bvec[right_idx])
221  left_idx++;
222  else
223  right_idx++;
224  }
225  }
226 
227  if (do_sort)
228  {
229  SG_FREE(avec);
230  SG_FREE(bvec);
231  }
232 
233  l->free_feature_vector(av, idx_a, free_av);
234  r->free_feature_vector(bv, idx_b, free_bv);
235 
236  return result;
237 }
238 
239 void CCommWordStringKernel::add_to_normal(int32_t vec_idx, float64_t weight)
240 {
241  int32_t len=-1;
242  bool free_vec;
243  uint16_t* vec=((CStringFeatures<uint16_t>*) lhs)->
244  get_feature_vector(vec_idx, len, free_vec);
245 
246  if (len>0)
247  {
248  int32_t j, last_j=0;
249  if (use_sign)
250  {
251  for (j=1; j<len; j++)
252  {
253  if (vec[j]==vec[j-1])
254  continue;
255 
256  dictionary_weights[(int32_t) vec[j-1]]+=normalizer->
257  normalize_lhs(weight, vec_idx);
258  }
259 
260  dictionary_weights[(int32_t) vec[len-1]]+=normalizer->
261  normalize_lhs(weight, vec_idx);
262  }
263  else
264  {
265  for (j=1; j<len; j++)
266  {
267  if (vec[j]==vec[j-1])
268  continue;
269 
270  dictionary_weights[(int32_t) vec[j-1]]+=normalizer->
271  normalize_lhs(weight*(j-last_j), vec_idx);
272  last_j = j;
273  }
274 
275  dictionary_weights[(int32_t) vec[len-1]]+=normalizer->
276  normalize_lhs(weight*(len-last_j), vec_idx);
277  }
278  set_is_initialized(true);
279  }
280 
281  ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(vec, vec_idx, free_vec);
282 }
283 
285 {
287  set_is_initialized(false);
288 }
289 
291  int32_t count, int32_t* IDX, float64_t* weights)
292 {
294 
295  if (count<=0)
296  {
297  set_is_initialized(true);
298  SG_DEBUG("empty set of SVs\n")
299  return true;
300  }
301 
302  SG_DEBUG("initializing CCommWordStringKernel optimization\n")
303 
304  for (auto i : progress(range(0, count), *this->io))
305  {
306  add_to_normal(IDX[i], weights[i]);
307  }
308 
309  set_is_initialized(true);
310  return true;
311 }
312 
314 {
315  SG_DEBUG("deleting CCommWordStringKernel optimization\n")
316 
317  clear_normal();
318  return true;
319 }
320 
322 {
323  if (!get_is_initialized())
324  {
325  SG_ERROR("CCommWordStringKernel optimization not initialized\n")
326  return 0 ;
327  }
328 
329  float64_t result = 0;
330  int32_t len = -1;
331  bool free_vec;
332  uint16_t* vec=((CStringFeatures<uint16_t>*) rhs)->
333  get_feature_vector(i, len, free_vec);
334 
335  int32_t j, last_j=0;
336  if (vec && len>0)
337  {
338  if (use_sign)
339  {
340  for (j=1; j<len; j++)
341  {
342  if (vec[j]==vec[j-1])
343  continue;
344 
345  result += dictionary_weights[(int32_t) vec[j-1]];
346  }
347 
348  result += dictionary_weights[(int32_t) vec[len-1]];
349  }
350  else
351  {
352  for (j=1; j<len; j++)
353  {
354  if (vec[j]==vec[j-1])
355  continue;
356 
357  result += dictionary_weights[(int32_t) vec[j-1]]*(j-last_j);
358  last_j = j;
359  }
360 
361  result += dictionary_weights[(int32_t) vec[len-1]]*(len-last_j);
362  }
363 
364  result=normalizer->normalize_rhs(result, i);
365  }
366  ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(vec, i, free_vec);
367  return result;
368 }
369 
371  int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
372  int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
373 {
374  ASSERT(lhs)
376  num_feat=1;//str->get_max_vector_length();
377  CAlphabet* alpha=str->get_alphabet();
378  ASSERT(alpha)
379  int32_t num_bits=alpha->get_num_bits();
380  int32_t order=str->get_order();
381  ASSERT(max_degree<=order)
382  //int32_t num_words=(int32_t) str->get_num_symbols();
383  int32_t num_words=(int32_t) str->get_original_num_symbols();
384  int32_t offset=0;
385 
386  num_sym=0;
387 
388  for (int32_t i=0; i<order; i++)
389  num_sym+=CMath::pow((int32_t) num_words,i+1);
390 
391  SG_DEBUG("num_words:%d, order:%d, len:%d sz:%d (len*sz:%d)\n", num_words, order,
392  num_feat, num_sym, num_feat*num_sym);
393 
394  if (!target)
395  target=SG_MALLOC(float64_t, num_feat*num_sym);
396  memset(target, 0, num_feat*num_sym*sizeof(float64_t));
397 
398  if (do_init)
399  init_optimization(num_suppvec, IDX, alphas);
400 
401  uint32_t kmer_mask=0;
402  uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order);
403 
404  for (int32_t o=0; o<max_degree; o++)
405  {
406  float64_t* contrib=&target[offset];
407  offset+=CMath::pow((int32_t) num_words,(int32_t) o+1);
408 
409  kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
410 
411  for (int32_t p=-o; p<order; p++)
412  {
413  int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
414  uint32_t imer_mask=kmer_mask;
415  uint32_t jmer_mask=kmer_mask;
416 
417  if (p<0)
418  {
419  il=-p;
420  m_sym=order-o-p-1;
421  o_sym=-p;
422  }
423  else if (p<order-o)
424  {
425  ir=p;
426  m_sym=order-o-1;
427  }
428  else
429  {
430  ir=p;
431  m_sym=p;
432  o_sym=p-order+o+1;
433  jl=order-ir;
434  imer_mask=(kmer_mask>>(num_bits*o_sym));
435  jmer_mask=(kmer_mask>>(num_bits*jl));
436  }
437 
438  float64_t marginalizer=
439  1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym);
440 
441  for (uint32_t i=0; i<words; i++)
442  {
443  uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
444 
445  if (p>=0 && p<order-o)
446  {
447 //#define DEBUG_COMMSCORING
448 #ifdef DEBUG_COMMSCORING
449  SG_PRINT("o=%d/%d p=%d/%d i=0x%x x=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d marg=%g o_sym:%d m_sym:%d weight(",
450  o,order, p,order, i, x, imer_mask, jmer_mask, kmer_mask, il, ir, marginalizer, o_sym, m_sym);
451 
452  SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n",
453  alpha->remap_to_char((x>>(3*num_bits))&0x03), alpha->remap_to_char((x>>(2*num_bits))&0x03),
454  alpha->remap_to_char((x>>num_bits)&0x03), alpha->remap_to_char(x&0x03),
455  alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
456  alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
457  dictionary_weights[i]*marginalizer, dictionary_weights[i]);
458 #endif
459  contrib[x]+=dictionary_weights[i]*marginalizer;
460  }
461  else
462  {
463  for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
464  {
465  uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
466 #ifdef DEBUG_COMMSCORING
467 
468  SG_PRINT("o=%d/%d p=%d/%d i=0x%x j=0x%x x=0x%x c=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d jl=%d marg=%g o_sym:%d m_sym:%d weight(",
469  o,order, p,order, i, j, x, c, imer_mask, jmer_mask, kmer_mask, il, ir, jl, marginalizer, o_sym, m_sym);
470  SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n",
471  alpha->remap_to_char((c>>(3*num_bits))&0x03), alpha->remap_to_char((c>>(2*num_bits))&0x03),
472  alpha->remap_to_char((c>>num_bits)&0x03), alpha->remap_to_char(c&0x03),
473  alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
474  alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
475  dictionary_weights[i]*marginalizer, dictionary_weights[i]);
476 #endif
477  contrib[c]+=dictionary_weights[i]*marginalizer;
478  }
479  }
480  }
481  }
482  }
483 
484  for (int32_t i=1; i<num_feat; i++)
485  sg_memcpy(&target[num_sym*i], target, num_sym*sizeof(float64_t));
486 
487  SG_UNREF(alpha);
488 
489  return target;
490 }
491 
492 
494  int32_t &result_len, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
495 {
496  ASSERT(lhs)
497  ASSERT(IDX)
498  ASSERT(alphas)
499 
501  int32_t num_words=(int32_t) str->get_num_symbols();
502  int32_t num_feat=str->get_max_vector_length();
503  int64_t total_len=((int64_t) num_feat) * num_words;
505  ASSERT(alpha)
506  int32_t num_bits=alpha->get_num_bits();
507  int32_t order=str->get_order();
508  int32_t max_idx=-1;
509  float64_t max_score=0;
510  result_len=num_feat+order-1;
511 
512  //init
513  init_optimization(num_suppvec, IDX, alphas);
514 
515  char* result=SG_MALLOC(char, result_len);
516  int32_t* bt=SG_MALLOC(int32_t, total_len);
517  float64_t* score=SG_MALLOC(float64_t, total_len);
518 
519  for (int64_t i=0; i<total_len; i++)
520  {
521  bt[i]=-1;
522  score[i]=0;
523  }
524 
525  for (int32_t t=0; t<num_words; t++)
526  score[t]=dictionary_weights[t];
527 
528  //dynamic program
529  for (int32_t i=1; i<num_feat; i++)
530  {
531  for (int32_t t1=0; t1<num_words; t1++)
532  {
533  max_idx=-1;
534  max_score=0;
535 
536  /* ignore weights the svm does not care about
537  * (has not seen in training). note that this assumes that zero
538  * weights are very unlikely to appear elsewise */
539 
540  //if (dictionary_weights[t1]==0.0)
541  //continue;
542 
543  /* iterate over words t ending on t1 and find the highest scoring
544  * pair */
545  uint16_t suffix=(uint16_t) t1 >> num_bits;
546 
547  for (int32_t sym=0; sym<str->get_original_num_symbols(); sym++)
548  {
549  uint16_t t=suffix | sym << (num_bits*(order-1));
550 
551  //if (dictionary_weights[t]==0.0)
552  // continue;
553 
554  float64_t sc=score[num_words*(i-1) + t]+dictionary_weights[t1];
555  if (sc > max_score || max_idx==-1)
556  {
557  max_idx=t;
558  max_score=sc;
559  }
560  }
561  ASSERT(max_idx!=-1)
562 
563  score[num_words*i + t1]=max_score;
564  bt[num_words*i + t1]=max_idx;
565  }
566  }
567 
568  //backtracking
569  max_idx=0;
570  max_score=score[num_words*(num_feat-1) + 0];
571  for (int32_t t=1; t<num_words; t++)
572  {
573  float64_t sc=score[num_words*(num_feat-1) + t];
574  if (sc>max_score)
575  {
576  max_idx=t;
577  max_score=sc;
578  }
579  }
580 
581  SG_DEBUG("max_idx:%i, max_score:%f\n", max_idx, max_score)
582 
583  for (int32_t i=result_len-1; i>=num_feat; i--)
584  result[i]=alpha->remap_to_char( (uint8_t) str->get_masked_symbols( (uint16_t) max_idx >> (num_bits*(result_len-1-i)), 1) );
585 
586  for (int32_t i=num_feat-1; i>=0; i--)
587  {
588  result[i]=alpha->remap_to_char( (uint8_t) str->get_masked_symbols( (uint16_t) max_idx >> (num_bits*(order-1)), 1) );
589  max_idx=bt[num_words*i + max_idx];
590  }
591 
592  SG_FREE(bt);
593  SG_FREE(score);
594  SG_UNREF(alpha);
595  return result;
596 }
597 
598 void CCommWordStringKernel::init()
599 {
600  use_sign=false;
603 
605  init_dictionary(1<<(sizeof(uint16_t)*8));
607 
608  SG_ADD(&dictionary_weights, "dictionary_weights",
609  "Dictionary for applying kernel.", MS_NOT_AVAILABLE);
610  SG_ADD(&use_sign, "use_sign",
611  "If signum(counts) is used instead of counts.", MS_AVAILABLE);
613  "use_dict_diagonal_optimization", "If K(x,x) is computed potentially "
614  "more efficiently.", MS_NOT_AVAILABLE);
615 }
virtual float64_t compute_optimized(int32_t idx)
virtual void cleanup()
Definition: Kernel.cpp:172
virtual bool init_optimization(int32_t count, int32_t *IDX, float64_t *weights)
virtual bool init_dictionary(int32_t size)
PRange< T > progress(Range< T > range, const SGIO &io, std::string prefix="PROGRESS: ", SG_PRG_MODE mode=UTF8, std::function< bool()> condition=[](){return true;})
Definition: progress.h:712
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:149
EAlphabet get_alphabet() const
Definition: Alphabet.h:130
virtual float64_t normalize_rhs(float64_t value, int32_t idx_rhs)=0
#define SG_ERROR(...)
Definition: SGIO.h:128
void set_is_initialized(bool p_init)
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
ST get_masked_symbols(ST symbol, uint8_t mask)
Range< T > range(T rend)
Definition: range.h:136
virtual float64_t compute_helper(int32_t idx_a, int32_t idx_b, bool do_sort)
bool get_is_initialized()
int32_t get_num_preprocessors() const
Definition: Features.cpp:155
#define SG_PRINT(...)
Definition: SGIO.h:136
#define ASSERT(x)
Definition: SGIO.h:176
virtual float64_t compute_diag(int32_t idx_a)
floatmax_t get_original_num_symbols()
double float64_t
Definition: common.h:60
static void radix_sort(T *array, int32_t size)
Definition: Math.h:1199
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
SGVector< ST > get_feature_vector(int32_t num)
virtual bool init(CFeatures *l, CFeatures *r)
int32_t get_num_bits() const
Definition: Alphabet.h:149
virtual bool init_normalizer()
Definition: Kernel.cpp:167
CFeatures * rhs
feature vectors to occur on right hand side
#define SG_UNREF(x)
Definition: SGObject.h:53
#define SG_DEBUG(...)
Definition: SGIO.h:106
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual float64_t * compute_scoring(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *target, int32_t num_suppvec, int32_t *IDX, float64_t *alphas, bool do_init=true)
int32_t get_num_preprocessed() const
Definition: Features.cpp:103
CFeatures * lhs
feature vectors to occur on left hand side
The class Features is the base class of all feature objects.
Definition: Features.h:69
char * compute_consensus(int32_t &num_feat, int32_t num_suppvec, int32_t *IDX, float64_t *alphas)
virtual int32_t get_max_vector_length()
CKernelNormalizer * normalizer
uint8_t remap_to_char(uint8_t c)
Definition: Alphabet.h:169
#define SG_ADD(...)
Definition: SGObject.h:93
virtual void add_to_normal(int32_t idx, float64_t weight)
SGVector< float64_t > dictionary_weights
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
static int32_t pow(bool x, int32_t n)
Definition: Math.h:474
index_t vlen
Definition: SGVector.h:571

SHOGUN Machine Learning Toolbox - Documentation