SHOGUN  6.1.3
WeightedDegreePositionStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <shogun/base/Parallel.h>
13 #include <shogun/base/progress.h>
14 #include <shogun/io/SGIO.h>
15 #include <shogun/lib/Signal.h>
16 #include <shogun/lib/Trie.h>
17 #include <shogun/lib/common.h>
18 
23 
25 
26 #ifdef HAVE_PTHREAD
27 #include <pthread.h>
28 
29 #endif
30 
31 using namespace shogun;
32 
33 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
34 
35 #ifndef DOXYGEN_SHOULD_SKIP_THIS
36 template <class Trie> struct S_THREAD_PARAM_WDS
37 {
38  int32_t* vec;
39  float64_t* result;
40  float64_t* weights;
42  CTrie<Trie>* tries;
43  float64_t factor;
44  int32_t j;
45  int32_t start;
46  int32_t end;
47  int32_t length;
48  int32_t max_shift;
49  int32_t* shift;
50  int32_t* vec_idx;
51 };
52 #endif // DOXYGEN_SHOULD_SKIP_THIS
53 
55  void)
56 : CStringKernel<char>()
57 {
58  init();
59 }
60 
62  int32_t size, int32_t d, int32_t mm, int32_t mkls)
63 : CStringKernel<char>(size)
64 {
65  init();
66 
67  mkl_stepsize=mkls;
68  degree=d;
69  max_mismatch=mm;
70 
73 
76 }
77 
79  int32_t size, SGVector<float64_t> w, int32_t d, int32_t mm, SGVector<int32_t> s,
80  int32_t mkls)
81 : CStringKernel<char>(size)
82 {
83  init();
84 
85  mkl_stepsize=mkls;
86  degree=d;
87  max_mismatch=mm;
88 
91 
92  weights=SG_MALLOC(float64_t, d*(1+max_mismatch));
95 
96  for (int32_t i=0; i<d*(1+max_mismatch); i++)
97  weights[i]=w[i];
98 
99  set_shifts(s);
100 }
101 
104 : CStringKernel<char>()
105 {
106  init();
107 
108  mkl_stepsize=1;
109  degree=d;
110 
113 
114  set_wd_weights();
115  ASSERT(weights)
116 
117  init(l, r);
118 }
119 
120 
122 {
123  cleanup();
124  cleanup_POIM2();
125 
126  SG_FREE(shift);
127  shift=NULL;
128 
129  SG_FREE(weights);
130  weights=NULL;
131  weights_degree=0;
132  weights_length=0;
133 
134  SG_FREE(block_weights);
135  block_weights=NULL;
136 
137  SG_FREE(position_weights);
138  position_weights=NULL;
139 
140  SG_FREE(position_weights_lhs);
142 
143  SG_FREE(position_weights_rhs);
145 
146  SG_FREE(weights_buffer);
147  weights_buffer=NULL;
148 }
149 
151 {
152  SG_DEBUG("deleting CWeightedDegreePositionStringKernel optimization\n")
154 
155  tries.destroy();
157 
159 }
160 
162 {
163  ASSERT(lhs)
164  seq_length = ((CStringFeatures<char>*) lhs)->get_max_vector_length();
165 
167  {
168  tries.create(seq_length, true);
170  }
171  else if (opt_type==FASTBUTMEMHUNGRY)
172  {
173  tries.create(seq_length, false); // still buggy
174  poim_tries.create(seq_length, false); // still buggy
175  }
176  else
177  SG_ERROR("unknown optimization type\n")
178 }
179 
180 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
181 {
182  int32_t lhs_changed = (lhs!=l) ;
183  int32_t rhs_changed = (rhs!=r) ;
184 
186 
187  SG_DEBUG("lhs_changed: %i\n", lhs_changed)
188  SG_DEBUG("rhs_changed: %i\n", rhs_changed)
189 
192 
193  /* set shift */
194  if (shift_len==0) {
195  shift_len=sf_l->get_vector_length(0);
196  int32_t *shifts=SG_MALLOC(int32_t, shift_len);
197  for (int32_t i=0; i<shift_len; i++) {
198  shifts[i]=1;
199  }
200  set_shifts(SGVector<int32_t>(shifts, shift_len, false));
201  SG_FREE(shifts);
202  }
203 
204 
205  int32_t len=sf_l->get_max_vector_length();
206  if (lhs_changed && !sf_l->have_same_length(len))
207  SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n")
208 
209  if (rhs_changed && !sf_r->have_same_length(len))
210  SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n")
211 
213  alphabet= sf_l->get_alphabet();
214  CAlphabet* ralphabet=sf_r->get_alphabet();
215 
216  if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
217  properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
218 
219  ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet())
220  SG_UNREF(ralphabet);
221 
222  //whenever init is called also init tries and block weights
225 
226  return init_normalizer();
227 }
228 
230 {
231  SG_DEBUG("deleting CWeightedDegreePositionStringKernel optimization\n")
233 
234  SG_FREE(block_weights);
235  block_weights=NULL;
236 
237  tries.destroy();
239 
240  seq_length = 0;
241  tree_initialized = false;
242 
244  alphabet=NULL;
245 
247 }
248 
250  int32_t p_count, int32_t * IDX, float64_t * alphas, int32_t tree_num,
251  int32_t upto_tree)
252 {
255 
256  if (upto_tree<0)
257  upto_tree=tree_num;
258 
259  if (max_mismatch!=0)
260  {
261  SG_ERROR("CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n")
262  return false ;
263  }
264 
265  if (tree_num<0)
266  SG_DEBUG("deleting CWeightedDegreePositionStringKernel optimization\n")
267 
269 
270  if (tree_num<0)
271  SG_DEBUG("initializing CWeightedDegreePositionStringKernel optimization\n")
272 
273  for (auto i : progress(range(p_count), *this->io))
274  {
275  if (tree_num<0)
276  {
277  add_example_to_tree(IDX[i], alphas[i]);
278  }
279  else
280  {
281  for (int32_t t=tree_num; t<=upto_tree; t++)
282  add_example_to_single_tree(IDX[i], alphas[i], t);
283  }
284  }
285 
286  set_is_initialized(true) ;
287  return true ;
288 }
289 
291 {
293  {
295  SG_DEBUG("disabling compact trie nodes with FASTBUTMEMHUNGRY\n")
296  }
297 
298  if (get_is_initialized())
299  {
301  tries.delete_trees(true);
302  else if (opt_type==FASTBUTMEMHUNGRY)
303  tries.delete_trees(false); // still buggy
304  else {
305  SG_ERROR("unknown optimization type\n")
306  }
307  set_is_initialized(false);
308 
309  return true;
310  }
311 
312  return false;
313 }
314 
316  char* avec, int32_t alen, char* bvec, int32_t blen)
317 {
318  float64_t* max_shift_vec= SG_MALLOC(float64_t, max_shift);
319  float64_t sum0=0 ;
320  for (int32_t i=0; i<max_shift; i++)
321  max_shift_vec[i]=0 ;
322 
323  // no shift
324  for (int32_t i=0; i<alen; i++)
325  {
326  if ((position_weights!=NULL) && (position_weights[i]==0.0))
327  continue ;
328 
329  int32_t mismatches=0;
330  float64_t sumi = 0.0 ;
331  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
332  {
333  if (avec[i+j]!=bvec[i+j])
334  {
335  mismatches++ ;
336  if (mismatches>max_mismatch)
337  break ;
338  } ;
339  sumi += weights[j+degree*mismatches];
340  }
341  if (position_weights!=NULL)
342  sum0 += position_weights[i]*sumi ;
343  else
344  sum0 += sumi ;
345  } ;
346 
347  for (int32_t i=0; i<alen; i++)
348  {
349  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
350  {
351  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
352  continue ;
353 
354  float64_t sumi1 = 0.0 ;
355  // shift in sequence a
356  int32_t mismatches=0;
357  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
358  {
359  if (avec[i+j+k]!=bvec[i+j])
360  {
361  mismatches++ ;
362  if (mismatches>max_mismatch)
363  break ;
364  } ;
365  sumi1 += weights[j+degree*mismatches];
366  }
367  float64_t sumi2 = 0.0 ;
368  // shift in sequence b
369  mismatches=0;
370  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
371  {
372  if (avec[i+j]!=bvec[i+j+k])
373  {
374  mismatches++ ;
375  if (mismatches>max_mismatch)
376  break ;
377  } ;
378  sumi2 += weights[j+degree*mismatches];
379  }
380  if (position_weights!=NULL)
381  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
382  else
383  max_shift_vec[k-1] += sumi1 + sumi2 ;
384  } ;
385  }
386 
387  float64_t result = sum0 ;
388  for (int32_t i=0; i<max_shift; i++)
389  result += max_shift_vec[i]/(2*(i+1)) ;
390 
391  SG_FREE(max_shift_vec);
392  return result ;
393 }
394 
396  char* avec, int32_t alen, char* bvec, int32_t blen)
397 {
398  float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
399  float64_t sum0=0 ;
400  for (int32_t i=0; i<max_shift; i++)
401  max_shift_vec[i]=0 ;
402 
403  // no shift
404  for (int32_t i=0; i<alen; i++)
405  {
406  if ((position_weights!=NULL) && (position_weights[i]==0.0))
407  continue ;
408 
409  float64_t sumi = 0.0 ;
410  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
411  {
412  if (avec[i+j]!=bvec[i+j])
413  break ;
414  sumi += weights[j];
415  }
416  if (position_weights!=NULL)
417  sum0 += position_weights[i]*sumi ;
418  else
419  sum0 += sumi ;
420  } ;
421 
422  for (int32_t i=0; i<alen; i++)
423  {
424  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
425  {
426  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
427  continue ;
428 
429  float64_t sumi1 = 0.0 ;
430  // shift in sequence a
431  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
432  {
433  if (avec[i+j+k]!=bvec[i+j])
434  break ;
435  sumi1 += weights[j];
436  }
437  float64_t sumi2 = 0.0 ;
438  // shift in sequence b
439  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
440  {
441  if (avec[i+j]!=bvec[i+j+k])
442  break ;
443  sumi2 += weights[j];
444  }
445  if (position_weights!=NULL)
446  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
447  else
448  max_shift_vec[k-1] += sumi1 + sumi2 ;
449  } ;
450  }
451 
452  float64_t result = sum0 ;
453  for (int32_t i=0; i<max_shift; i++)
454  result += max_shift_vec[i]/(2*(i+1)) ;
455 
456  SG_FREE(max_shift_vec);
457 
458  return result ;
459 }
460 
462  char* avec, int32_t alen, char* bvec, int32_t blen)
463 {
464  float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
465  float64_t sum0=0 ;
466  for (int32_t i=0; i<max_shift; i++)
467  max_shift_vec[i]=0 ;
468 
469  // no shift
470  for (int32_t i=0; i<alen; i++)
471  {
472  if ((position_weights!=NULL) && (position_weights[i]==0.0))
473  continue ;
474  float64_t sumi = 0.0 ;
475  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
476  {
477  if (avec[i+j]!=bvec[i+j])
478  break ;
479  sumi += weights[i*degree+j];
480  }
481  if (position_weights!=NULL)
482  sum0 += position_weights[i]*sumi ;
483  else
484  sum0 += sumi ;
485  } ;
486 
487  for (int32_t i=0; i<alen; i++)
488  {
489  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
490  {
491  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
492  continue ;
493 
494  float64_t sumi1 = 0.0 ;
495  // shift in sequence a
496  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
497  {
498  if (avec[i+j+k]!=bvec[i+j])
499  break ;
500  sumi1 += weights[i*degree+j];
501  }
502  float64_t sumi2 = 0.0 ;
503  // shift in sequence b
504  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
505  {
506  if (avec[i+j]!=bvec[i+j+k])
507  break ;
508  sumi2 += weights[i*degree+j];
509  }
510  if (position_weights!=NULL)
511  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
512  else
513  max_shift_vec[k-1] += sumi1 + sumi2 ;
514  } ;
515  }
516 
517  float64_t result = sum0 ;
518  for (int32_t i=0; i<max_shift; i++)
519  result += max_shift_vec[i]/(2*(i+1)) ;
520 
521  SG_FREE(max_shift_vec);
522  return result ;
523 }
524 
526  char* avec, float64_t* pos_weights_lhs, int32_t alen, char* bvec,
527  float64_t* pos_weights_rhs, int32_t blen)
528 {
529  float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
530  float64_t sum0=0 ;
531  for (int32_t i=0; i<max_shift; i++)
532  max_shift_vec[i]=0 ;
533 
534  // no shift
535  for (int32_t i=0; i<alen; i++)
536  {
537  if ((position_weights!=NULL) && (position_weights[i]==0.0))
538  continue ;
539 
540  float64_t sumi = 0.0 ;
541  float64_t posweight_lhs = 0.0 ;
542  float64_t posweight_rhs = 0.0 ;
543  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
544  {
545  posweight_lhs += pos_weights_lhs[i+j] ;
546  posweight_rhs += pos_weights_rhs[i+j] ;
547 
548  if (avec[i+j]!=bvec[i+j])
549  break ;
550  sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
551  }
552  if (position_weights!=NULL)
553  sum0 += position_weights[i]*sumi ;
554  else
555  sum0 += sumi ;
556  } ;
557 
558  for (int32_t i=0; i<alen; i++)
559  {
560  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
561  {
562  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
563  continue ;
564 
565  // shift in sequence a
566  float64_t sumi1 = 0.0 ;
567  float64_t posweight_lhs = 0.0 ;
568  float64_t posweight_rhs = 0.0 ;
569  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
570  {
571  posweight_lhs += pos_weights_lhs[i+j+k] ;
572  posweight_rhs += pos_weights_rhs[i+j] ;
573  if (avec[i+j+k]!=bvec[i+j])
574  break ;
575  sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
576  }
577  // shift in sequence b
578  float64_t sumi2 = 0.0 ;
579  posweight_lhs = 0.0 ;
580  posweight_rhs = 0.0 ;
581  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
582  {
583  posweight_lhs += pos_weights_lhs[i+j] ;
584  posweight_rhs += pos_weights_rhs[i+j+k] ;
585  if (avec[i+j]!=bvec[i+j+k])
586  break ;
587  sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
588  }
589  if (position_weights!=NULL)
590  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
591  else
592  max_shift_vec[k-1] += sumi1 + sumi2 ;
593  } ;
594  }
595 
596  float64_t result = sum0 ;
597  for (int32_t i=0; i<max_shift; i++)
598  result += max_shift_vec[i]/(2*(i+1)) ;
599 
600  SG_FREE(max_shift_vec);
601  return result ;
602 }
603 
604 
606  int32_t idx_a, int32_t idx_b)
607 {
608  int32_t alen, blen;
609  bool free_avec, free_bvec;
610 
611  char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
612  char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
613  // can only deal with strings of same length
614  ASSERT(alen==blen)
615  ASSERT(shift_len==alen)
616 
617  float64_t result = 0 ;
618  if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
619  {
620  ASSERT(max_mismatch==0)
621  float64_t* position_weights_rhs_ = position_weights_rhs ;
622  if (lhs==rhs)
623  position_weights_rhs_ = position_weights_lhs ;
624 
625  result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs_[idx_b*blen], blen) ;
626  }
627  else if (max_mismatch > 0)
628  result = compute_with_mismatch(avec, alen, bvec, blen) ;
629  else if (length==0)
630  result = compute_without_mismatch(avec, alen, bvec, blen) ;
631  else
632  result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
633 
634  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
635  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
636 
637  return result ;
638 }
639 
640 
642  int32_t idx, float64_t alpha)
643 {
648 
649  int32_t len=0;
650  bool free_vec;
651  char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
652  ASSERT(max_mismatch==0)
653  int32_t *vec = SG_MALLOC(int32_t, len);
654 
655  for (int32_t i=0; i<len; i++)
656  vec[i]=alphabet->remap_to_bin(char_vec[i]);
657  ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
658 
660  {
661  //TRIES(set_use_compact_terminal_nodes(false)) ;
662  ASSERT(!TRIES(get_use_compact_terminal_nodes()))
663  }
664 
665  for (int32_t i=0; i<len; i++)
666  {
667  int32_t max_s=-1;
668 
670  max_s=0;
671  else if (opt_type==FASTBUTMEMHUNGRY)
672  max_s=shift[i];
673  else {
674  SG_ERROR("unknown optimization type\n")
675  }
676 
677  for (int32_t s=max_s; s>=0; s--)
678  {
679  float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
680  TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
681  if ((s==0) || (i+s>=len))
682  continue;
683 
684  TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
685  }
686  }
687 
688  SG_FREE(vec);
689  tree_initialized=true ;
690 }
691 
693  int32_t idx, float64_t alpha, int32_t tree_num)
694 {
699 
700  int32_t len=0;
701  bool free_vec;
702  char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
703  ASSERT(max_mismatch==0)
704  int32_t *vec=SG_MALLOC(int32_t, len);
705  int32_t max_s=-1;
706 
708  max_s=0;
709  else if (opt_type==FASTBUTMEMHUNGRY)
710  {
712  max_s=shift[tree_num];
713  }
714  else {
715  SG_ERROR("unknown optimization type\n")
716  }
717  for (int32_t i=CMath::max(0,tree_num-max_shift);
718  i<CMath::min(len,tree_num+degree+max_shift); i++)
719  {
720  vec[i]=alphabet->remap_to_bin(char_vec[i]);
721  }
722  ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
723 
724  for (int32_t s=max_s; s>=0; s--)
725  {
726  float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
727  tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
728  }
729 
731  {
732  for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
733  {
734  int32_t s=tree_num-i;
735  if ((i+s<len) && (s>=1) && (s<=shift[i]))
736  {
737  float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
738  tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
739  }
740  }
741  }
742  SG_FREE(vec);
743  tree_initialized=true ;
744 }
745 
747 {
752 
753  float64_t sum=0;
754  int32_t len=0;
755  bool free_vec;
756  char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
757  ASSERT(max_mismatch==0)
758  int32_t *vec=SG_MALLOC(int32_t, len);
759 
760  for (int32_t i=0; i<len; i++)
761  vec[i]=alphabet->remap_to_bin(char_vec[i]);
762 
763  ((CStringFeatures<char>*) rhs)->free_feature_vector(char_vec, idx, free_vec);
764 
765  for (int32_t i=0; i<len; i++)
766  sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
767 
769  {
770  for (int32_t i=0; i<len; i++)
771  {
772  for (int32_t s=1; (s<=shift[i]) && (i+s<len); s++)
773  {
774  sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
775  sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
776  }
777  }
778  }
779 
780  SG_FREE(vec);
781 
782  return normalizer->normalize_rhs(sum, idx);
783 }
784 
786  int32_t idx, float64_t* LevelContrib)
787 {
792 
793  int32_t len=0;
794  bool free_vec;
795  char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
796  ASSERT(max_mismatch==0)
797  int32_t *vec=SG_MALLOC(int32_t, len);
798 
799  for (int32_t i=0; i<len; i++)
800  vec[i]=alphabet->remap_to_bin(char_vec[i]);
801 
802  ((CStringFeatures<char>*) rhs)->free_feature_vector(char_vec, idx, free_vec);
803 
804  for (int32_t i=0; i<len; i++)
805  {
806  tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
808  (length!=0));
809  }
810 
812  {
813  for (int32_t i=0; i<len; i++)
814  for (int32_t k=1; (k<=shift[i]) && (i+k<len); k++)
815  {
816  tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib,
817  normalizer->normalize_rhs(1.0/(2*k), idx), mkl_stepsize,
818  weights, (length!=0)) ;
819  tries.compute_by_tree_helper(vec, len, i+k, i, i+k,
820  LevelContrib, normalizer->normalize_rhs(1.0/(2*k), idx),
821  mkl_stepsize, weights, (length!=0)) ;
822  }
823  }
824 
825  SG_FREE(vec);
826 }
827 
829  int32_t &len)
830 {
831  return tries.compute_abs_weights(len);
832 }
833 
835 {
836  SG_FREE(shift);
837 
838  shift_len = shifts.vlen;
839  shift = SG_MALLOC(int32_t, shift_len);
840 
841  if (shift)
842  {
843  max_shift = 0 ;
844 
845  for (int32_t i=0; i<shift_len; i++)
846  {
847  shift[i] = shifts.vector[i] ;
849  }
850 
851  ASSERT(max_shift>=0 && max_shift<=shift_len)
852  }
853 }
854 
856 {
857  ASSERT(degree>0)
858 
859  SG_FREE(weights);
860  weights=SG_MALLOC(float64_t, degree);
862  weights_length=1;
863 
864  if (weights)
865  {
866  int32_t i;
867  float64_t sum=0;
868  for (i=0; i<degree; i++)
869  {
870  weights[i]=degree-i;
871  sum+=weights[i];
872  }
873  for (i=0; i<degree; i++)
874  weights[i]/=sum;
875 
876  for (i=0; i<degree; i++)
877  {
878  for (int32_t j=1; j<=max_mismatch; j++)
879  {
880  if (j<i+1)
881  {
882  int32_t nk=CMath::nchoosek(i+1, j);
883  weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
884  }
885  else
886  weights[i+j*degree]= 0;
887  }
888  }
889 
890  return true;
891  }
892  else
893  return false;
894 }
895 
897 {
898  float64_t* ws=new_weights.matrix;
899  int32_t d=new_weights.num_rows;
900  int32_t len=new_weights.num_cols;
901 
902  if (d!=degree || len<0)
903  SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree)
904 
905  degree=d;
906  length=len;
907 
908  if (len <= 0)
909  len=1;
910 
913 
914  SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length)
915  int32_t num_weights=weights_degree*weights_length;
916  SG_FREE(weights);
917  weights=SG_MALLOC(float64_t, num_weights);
918 
919  for (int32_t i=0; i<degree*len; i++)
920  weights[i]=ws[i];
921 
922  return true;
923 }
924 
926 {
927  if (seq_length==0)
928  seq_length=pws.vlen;
929 
930  if (seq_length!=pws.vlen)
931  SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, pws.vlen)
932 
933  SG_FREE(position_weights);
934  position_weights=SG_MALLOC(float64_t, pws.vlen);
937 
938  for (int32_t i=0; i<pws.vlen; i++)
939  position_weights[i]=pws.vector[i];
940 }
941 
943 {
946  else
948 
949  if (len==0)
950  {
952  }
953 
954  if (seq_length!=len)
955  {
956  SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len)
957  return false;
958  }
959 
960  SG_FREE(position_weights_lhs);
961  position_weights_lhs=SG_MALLOC(float64_t, len*num);
962  position_weights_lhs_len=len*num;
963 
964  for (int32_t i=0; i<len*num; i++)
965  position_weights_lhs[i]=pws[i];
966 
967  return true;
968 }
969 
971  float64_t* pws, int32_t len, int32_t num)
972 {
973  if (len==0)
974  {
976  {
978  return true;
979  }
981  }
982 
983  if (seq_length!=len)
984  {
985  SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len)
986  return false;
987  }
988 
989  SG_FREE(position_weights_rhs);
990  position_weights_rhs=SG_MALLOC(float64_t, len*num);
991  position_weights_rhs_len=len*num;
992 
993  for (int32_t i=0; i<len*num; i++)
994  position_weights_rhs[i]=pws[i];
995 
996  return true;
997 }
998 
1000 {
1001  SG_FREE(block_weights);
1003 
1004  if (block_weights)
1005  {
1006  int32_t k;
1007  float64_t d=degree; // use float to evade rounding errors below
1008 
1009  for (k=0; k<degree; k++)
1010  block_weights[k]=
1011  (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
1012  for (k=degree; k<seq_length; k++)
1013  block_weights[k]=(-d+3*k+4)/3;
1014  }
1015 
1016  return (block_weights!=NULL);
1017 }
1018 
1020 {
1021  ASSERT(weights)
1022  SG_FREE(block_weights);
1024 
1025  if (block_weights)
1026  {
1027  int32_t i=0;
1028  block_weights[0]=weights[0];
1029  for (i=1; i<CMath::max(seq_length,degree); i++)
1030  block_weights[i]=0;
1031 
1032  for (i=1; i<CMath::max(seq_length,degree); i++)
1033  {
1034  block_weights[i]=block_weights[i-1];
1035 
1036  float64_t contrib=0;
1037  for (int32_t j=0; j<CMath::min(degree,i+1); j++)
1038  contrib+=weights[j];
1039 
1040  block_weights[i]+=contrib;
1041  }
1042  }
1043 
1044  return (block_weights!=NULL);
1045 }
1046 
1048 {
1049  SG_FREE(block_weights);
1050  block_weights=SG_MALLOC(float64_t, seq_length);
1051 
1052  if (block_weights)
1053  {
1054  for (int32_t i=1; i<seq_length+1 ; i++)
1055  block_weights[i-1]=1.0/seq_length;
1056  }
1057 
1058  return (block_weights!=NULL);
1059 }
1060 
1062 {
1063  SG_FREE(block_weights);
1064  block_weights=SG_MALLOC(float64_t, seq_length);
1065 
1066  if (block_weights)
1067  {
1068  for (int32_t i=1; i<seq_length+1 ; i++)
1069  block_weights[i-1]=degree*i;
1070  }
1071 
1072  return (block_weights!=NULL);
1073 }
1074 
1076 {
1077  SG_FREE(block_weights);
1078  block_weights=SG_MALLOC(float64_t, seq_length);
1079 
1080  if (block_weights)
1081  {
1082  for (int32_t i=1; i<degree+1 ; i++)
1083  block_weights[i-1]=((float64_t) i)*i;
1084 
1085  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1086  block_weights[i-1]=i;
1087  }
1088 
1089  return (block_weights!=NULL);
1090 }
1091 
1093 {
1094  SG_FREE(block_weights);
1095  block_weights=SG_MALLOC(float64_t, seq_length);
1096 
1097  if (block_weights)
1098  {
1099  for (int32_t i=1; i<degree+1 ; i++)
1100  block_weights[i-1]=((float64_t) i)*i*i;
1101 
1102  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1103  block_weights[i-1]=i;
1104  }
1105 
1106  return (block_weights!=NULL);
1107 }
1108 
1110 {
1111  SG_FREE(block_weights);
1112  block_weights=SG_MALLOC(float64_t, seq_length);
1113 
1114  if (block_weights)
1115  {
1116  for (int32_t i=1; i<degree+1 ; i++)
1117  block_weights[i-1]=exp(((float64_t) i/10.0));
1118 
1119  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1120  block_weights[i-1]=i;
1121  }
1122 
1123  return (block_weights!=NULL);
1124 }
1125 
1127 {
1128  SG_FREE(block_weights);
1129  block_weights=SG_MALLOC(float64_t, seq_length);
1130 
1131  if (block_weights)
1132  {
1133  for (int32_t i=1; i<degree+1 ; i++)
1135 
1136  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1137  block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
1138  }
1139 
1140  return (block_weights!=NULL);
1141 }
1142 
1144 {
1145  switch (type)
1146  {
1147  case E_WD:
1148  return init_block_weights_from_wd();
1149  case E_EXTERNAL:
1151  case E_BLOCK_CONST:
1152  return init_block_weights_const();
1153  case E_BLOCK_LINEAR:
1154  return init_block_weights_linear();
1155  case E_BLOCK_SQPOLY:
1156  return init_block_weights_sqpoly();
1157  case E_BLOCK_CUBICPOLY:
1159  case E_BLOCK_EXP:
1160  return init_block_weights_exp();
1161  case E_BLOCK_LOG:
1162  return init_block_weights_log();
1163  };
1164  return false;
1165 }
1166 
1167 
1168 
1170 {
1171  S_THREAD_PARAM_WDS<DNATrie>* params = (S_THREAD_PARAM_WDS<DNATrie>*) p;
1172  int32_t j=params->j;
1173  CWeightedDegreePositionStringKernel* wd=params->kernel;
1174  CTrie<DNATrie>* tries=params->tries;
1175  float64_t* weights=params->weights;
1176  int32_t length=params->length;
1177  int32_t max_shift=params->max_shift;
1178  int32_t* vec=params->vec;
1179  float64_t* result=params->result;
1180  float64_t factor=params->factor;
1181  int32_t* shift=params->shift;
1182  int32_t* vec_idx=params->vec_idx;
1183 
1184  for (int32_t i=params->start; i<params->end; i++)
1185  {
1186  int32_t len=0;
1188  CAlphabet* alpha=wd->alphabet;
1189 
1190  bool free_vec;
1191  char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
1192  for (int32_t k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
1193  vec[k]=alpha->remap_to_bin(char_vec[k]);
1194  rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
1195 
1196  SG_UNREF(rhs_feat);
1197 
1198  result[i] += factor*wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
1199 
1201  {
1202  for (int32_t q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
1203  {
1204  int32_t s=j-q ;
1205  if ((s>=1) && (s<=shift[q]) && (q+s<len))
1206  {
1207  result[i] +=
1209  len, q, q+s, q, weights, (length!=0)),
1210  vec_idx[i])/(2.0*s);
1211  }
1212  }
1213 
1214  for (int32_t s=1; (s<=shift[j]) && (j+s<len); s++)
1215  {
1216  result[i] +=
1218  len, j+s, j, j+s, weights, (length!=0)),
1219  vec_idx[i])/(2.0*s);
1220  }
1221  }
1222  }
1223 
1224  return NULL;
1225 }
1226 
1228  int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
1229  int32_t* IDX, float64_t* alphas, float64_t factor)
1230 {
1231  ASSERT(alphabet)
1235  ASSERT(rhs)
1236  ASSERT(num_vec<=rhs->get_num_vectors())
1237  ASSERT(num_vec>0)
1238  ASSERT(vec_idx)
1239  ASSERT(result)
1241 
1242  int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1243  ASSERT(num_feat>0)
1244  // TODO: port to use OpenMP backend instead of pthread
1245 #ifdef HAVE_PTHREAD
1246  int32_t num_threads=parallel->get_num_threads();
1247 #else
1248  int32_t num_threads=1;
1249 #endif
1250  ASSERT(num_threads>0)
1251  int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat);
1252 
1253  if (num_threads < 2)
1254  {
1255 
1256  auto pb = progress(range(num_feat), *this->io);
1257  // TODO: replace with the new signal
1258  // for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
1259  for (int32_t j = 0; j < num_feat; j++)
1260  {
1261  init_optimization(num_suppvec, IDX, alphas, j);
1262  S_THREAD_PARAM_WDS<DNATrie> params;
1263  params.vec = vec;
1264  params.result = result;
1265  params.weights = weights;
1266  params.kernel = this;
1267  params.tries = &tries;
1268  params.factor = factor;
1269  params.j = j;
1270  params.start = 0;
1271  params.end = num_vec;
1272  params.length = length;
1273  params.max_shift = max_shift;
1274  params.shift = shift;
1275  params.vec_idx = vec_idx;
1276  compute_batch_helper((void*)&params);
1277 
1278  pb.print_progress();
1279  }
1280  pb.complete();
1281  }
1282 #ifdef HAVE_PTHREAD
1283  else
1284  {
1285 
1286  auto pb = progress(range(num_feat), *this->io);
1287  // TODO: replace with the new signal
1288  // for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
1289  for (int32_t j = 0; j < num_feat; j++)
1290  {
1291  init_optimization(num_suppvec, IDX, alphas, j);
1292  pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
1293  S_THREAD_PARAM_WDS<DNATrie>* params = SG_MALLOC(S_THREAD_PARAM_WDS<DNATrie>, num_threads);
1294  int32_t step= num_vec/num_threads;
1295  int32_t t;
1296 
1297  for (t=0; t<num_threads-1; t++)
1298  {
1299  params[t].vec=&vec[num_feat*t];
1300  params[t].result=result;
1301  params[t].weights=weights;
1302  params[t].kernel=this;
1303  params[t].tries=&tries;
1304  params[t].factor=factor;
1305  params[t].j=j;
1306  params[t].start = t*step;
1307  params[t].end = (t+1)*step;
1308  params[t].length=length;
1309  params[t].max_shift=max_shift;
1310  params[t].shift=shift;
1311  params[t].vec_idx=vec_idx;
1312  pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)&params[t]);
1313  }
1314 
1315  params[t].vec=&vec[num_feat*t];
1316  params[t].result=result;
1317  params[t].weights=weights;
1318  params[t].kernel=this;
1319  params[t].tries=&tries;
1320  params[t].factor=factor;
1321  params[t].j=j;
1322  params[t].start=t*step;
1323  params[t].end=num_vec;
1324  params[t].length=length;
1325  params[t].max_shift=max_shift;
1326  params[t].shift=shift;
1327  params[t].vec_idx=vec_idx;
1328  compute_batch_helper((void*) &params[t]);
1329 
1330  for (t=0; t<num_threads-1; t++)
1331  pthread_join(threads[t], NULL);
1332  pb.print_progress();
1333 
1334  SG_FREE(params);
1335  SG_FREE(threads);
1336  }
1337  pb.complete();
1338  }
1339 #endif
1340 
1341  SG_FREE(vec);
1342 
1343  //really also free memory as this can be huge on testing especially when
1344  //using the combined kernel
1346 }
1347 
1349  int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* result,
1350  int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
1351 {
1354 
1355  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1356  ASSERT(num_feat>0)
1357  ASSERT(alphabet)
1359 
1360  num_sym=4; //for now works only w/ DNA
1361 
1362  ASSERT(max_degree>0)
1363 
1364  // === variables
1365  int32_t* nofsKmers=SG_MALLOC(int32_t, max_degree);
1366  float64_t** C=SG_MALLOC(float64_t*, max_degree);
1367  float64_t** L=SG_MALLOC(float64_t*, max_degree);
1368  float64_t** R=SG_MALLOC(float64_t*, max_degree);
1369 
1370  int32_t i;
1371  int32_t k;
1372 
1373  // --- return table
1374  int32_t bigtabSize=0;
1375  for (k=0; k<max_degree; ++k )
1376  {
1377  nofsKmers[k]=(int32_t) CMath::pow(num_sym, k+1);
1378  const int32_t tabSize=nofsKmers[k]*num_feat;
1379  bigtabSize+=tabSize;
1380  }
1381  result=SG_MALLOC(float64_t, bigtabSize);
1382 
1383  // --- auxilliary tables
1384  int32_t tabOffs=0;
1385  for( k = 0; k < max_degree; ++k )
1386  {
1387  const int32_t tabSize = nofsKmers[k] * num_feat;
1388  C[k] = &result[tabOffs];
1389  L[k] = SG_MALLOC(float64_t, tabSize );
1390  R[k] = SG_MALLOC(float64_t, tabSize );
1391  tabOffs+=tabSize;
1392  for(i = 0; i < tabSize; i++ )
1393  {
1394  C[k][i] = 0.0;
1395  L[k][i] = 0.0;
1396  R[k][i] = 0.0;
1397  }
1398  }
1399 
1400  // --- tree parsing info
1401  float64_t* margFactors=SG_MALLOC(float64_t, degree);
1402 
1403  int32_t* x = SG_MALLOC(int32_t, degree+1 );
1404  int32_t* substrs = SG_MALLOC(int32_t, degree+1 );
1405  // - fill arrays
1406  margFactors[0] = 1.0;
1407  substrs[0] = 0;
1408  for( k=1; k < degree; ++k ) {
1409  margFactors[k] = 0.25 * margFactors[k-1];
1410  substrs[k] = -1;
1411  }
1412  substrs[degree] = -1;
1413  // - fill struct
1414  struct TreeParseInfo info;
1415  info.num_sym = num_sym;
1416  info.num_feat = num_feat;
1417  info.p = -1;
1418  info.k = -1;
1419  info.nofsKmers = nofsKmers;
1420  info.margFactors = margFactors;
1421  info.x = x;
1422  info.substrs = substrs;
1423  info.y0 = 0;
1424  info.C_k = NULL;
1425  info.L_k = NULL;
1426  info.R_k = NULL;
1427 
1428  // === main loop
1429  auto pb = progress(range(num_feat * max_degree), *this->io);
1430  for( k = 0; k < max_degree; ++k )
1431  {
1432  const int32_t nofKmers = nofsKmers[ k ];
1433  info.C_k = C[k];
1434  info.L_k = L[k];
1435  info.R_k = R[k];
1436 
1437  // --- run over all trees
1438  for(int32_t p = 0; p < num_feat; ++p )
1439  {
1440  init_optimization( num_suppvec, IDX, alphas, p );
1441  int32_t tree = p ;
1442  for(int32_t j = 0; j < degree+1; j++ ) {
1443  x[j] = -1;
1444  }
1445  tries.traverse( tree, p, info, 0, x, k );
1446  pb.print_progress();
1447  }
1448 
1449  // --- add partial overlap scores
1450  if( k > 0 ) {
1451  const int32_t j = k - 1;
1452  const int32_t nofJmers = (int32_t) CMath::pow( num_sym, j+1 );
1453  for(int32_t p = 0; p < num_feat; ++p ) {
1454  const int32_t offsetJ = nofJmers * p;
1455  const int32_t offsetJ1 = nofJmers * (p+1);
1456  const int32_t offsetK = nofKmers * p;
1457  int32_t y;
1458  int32_t sym;
1459  for( y = 0; y < nofJmers; ++y ) {
1460  for( sym = 0; sym < num_sym; ++sym ) {
1461  const int32_t y_sym = num_sym*y + sym;
1462  const int32_t sym_y = nofJmers*sym + y;
1463  ASSERT(0<=y_sym && y_sym<nofKmers)
1464  ASSERT(0<=sym_y && sym_y<nofKmers)
1465  C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
1466  if( p < num_feat-1 ) {
1467  C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
1468  }
1469  }
1470  }
1471  }
1472  }
1473  // if( k > 1 )
1474  // j = k-1
1475  // for all positions p
1476  // for all j-mers y
1477  // for n in {A,C,G,T}
1478  // C_k[ p, [y,n] ] += L_j[ p, y ]
1479  // C_k[ p, [n,y] ] += R_j[ p+1, y ]
1480  // end;
1481  // end;
1482  // end;
1483  // end;
1484  }
1485  pb.complete();
1486 
1487  // === return a vector
1488  num_feat=1;
1489  num_sym = bigtabSize;
1490  // --- clean up
1491  SG_FREE(nofsKmers);
1492  SG_FREE(margFactors);
1493  SG_FREE(substrs);
1494  SG_FREE(x);
1495  SG_FREE(C);
1496  for( k = 0; k < max_degree; ++k ) {
1497  SG_FREE(L[k]);
1498  SG_FREE(R[k]);
1499  }
1500  SG_FREE(L);
1501  SG_FREE(R);
1502  return result;
1503 }
1504 
1506  int32_t &num_feat, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
1507 {
1510  //only works for order <= 32
1511  ASSERT(degree<=32)
1513  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1514  ASSERT(num_feat>0)
1515  ASSERT(alphabet)
1517 
1518  //consensus
1519  char* result=SG_MALLOC(char, num_feat);
1520 
1521  //backtracking and scoring table
1522  int32_t num_tables=CMath::max(1,num_feat-degree+1);
1523  DynArray<ConsensusEntry>** table=SG_MALLOC(DynArray<ConsensusEntry>*, num_tables);
1524 
1525  for (int32_t i=0; i<num_tables; i++)
1526  table[i]=new DynArray<ConsensusEntry>(num_suppvec/10);
1527 
1528  //compute consensus via dynamic programming
1529  for (auto i : progress(range(num_tables), *this->io))
1530  {
1531  bool cumulative=false;
1532 
1533  if (i<num_tables-1)
1534  init_optimization(num_suppvec, IDX, alphas, i);
1535  else
1536  {
1537  init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
1538  cumulative=true;
1539  }
1540 
1541  if (i==0)
1542  tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
1543  else
1544  tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
1545  }
1546 
1547 
1548  //int32_t n=table[0]->get_num_elements();
1549 
1550  //for (int32_t i=0; i<n; i++)
1551  //{
1552  // ConsensusEntry e= table[0]->get_element(i);
1553  // SG_PRint32_t("first: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt);
1554  //}
1555 
1556  //n=table[num_tables-1]->get_num_elements();
1557  //for (int32_t i=0; i<n; i++)
1558  //{
1559  // ConsensusEntry e= table[num_tables-1]->get_element(i);
1560  // SG_PRint32_t("last: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt);
1561  //}
1562  //n=table[num_tables-2]->get_num_elements();
1563  //for (int32_t i=0; i<n; i++)
1564  //{
1565  // ConsensusEntry e= table[num_tables-2]->get_element(i);
1566  // SG_PRINT("second last: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt)
1567  //}
1568 
1569  const char* acgt="ACGT";
1570 
1571  //backtracking start
1572  int32_t max_idx=-1;
1573  float32_t max_score=0;
1574  int32_t num_elements=table[num_tables-1]->get_num_elements();
1575 
1576  for (int32_t i=0; i<num_elements; i++)
1577  {
1578  float64_t sc=table[num_tables-1]->get_element(i).score;
1579  if (sc>max_score || max_idx==-1)
1580  {
1581  max_idx=i;
1582  max_score=sc;
1583  }
1584  }
1585  uint64_t endstr=table[num_tables-1]->get_element(max_idx).string;
1586 
1587  SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score)
1588 
1589  for (int32_t i=0; i<degree; i++)
1590  result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
1591 
1592  if (num_tables>1)
1593  {
1594  for (int32_t i=num_tables-1; i>=0; i--)
1595  {
1596  //SG_PRINT("max_idx: %d, i:%d\n", max_idx, i)
1597  result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
1598  max_idx=table[i]->get_element(max_idx).bt;
1599  }
1600  }
1601 
1602  //for (int32_t t=0; t<num_tables; t++)
1603  //{
1604  // n=table[t]->get_num_elements();
1605  // for (int32_t i=0; i<n; i++)
1606  // {
1607  // ConsensusEntry e= table[t]->get_element(i);
1608  // SG_PRINT("table[%d,%d]: str:0%0llx sc:%+f bt:%d\n",t,i, e.string,e.score,e.bt)
1609  // }
1610  //}
1611 
1612  for (int32_t i=0; i<num_tables; i++)
1613  delete table[i];
1614 
1615  SG_FREE(table);
1616  return result;
1617 }
1618 
1619 
1621  int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
1622  float64_t* w_result, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
1623 {
1625  use_poim_tries=true;
1626  poim_tries.delete_trees(false);
1627 
1628  // === check
1631  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1632  ASSERT(num_feat>0)
1634  ASSERT(max_degree>0)
1635 
1636  // === general variables
1637  static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
1638  const int32_t seqLen = num_feat;
1639  float64_t** subs;
1640  int32_t i;
1641  int32_t k;
1642  //int32_t y;
1643 
1644  // === init tables "subs" for substring scores / POIMs
1645  // --- compute table sizes
1646  int32_t* offsets;
1647  int32_t offset;
1648  offsets = SG_MALLOC(int32_t, max_degree );
1649  offset = 0;
1650  for( k = 0; k < max_degree; ++k ) {
1651  offsets[k] = offset;
1652  const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
1653  const int32_t tabSize = nofsKmers * seqLen;
1654  offset += tabSize;
1655  }
1656  // --- allocate memory
1657  const int32_t bigTabSize = offset;
1658  w_result=SG_MALLOC(float64_t, bigTabSize);
1659  for (i=0; i<bigTabSize; ++i)
1660  w_result[i]=0;
1661 
1662  // --- set pointers for tables
1663  subs = SG_MALLOC(float64_t*, max_degree );
1664  ASSERT( subs != NULL )
1665  for( k = 0; k < max_degree; ++k ) {
1666  subs[k] = &w_result[ offsets[k] ];
1667  }
1668  SG_FREE(offsets);
1669 
1670  // === init trees; extract "w"
1671  init_optimization( num_suppvec, IDX, alphas, -1);
1672  poim_tries.POIMs_extract_W( subs, max_degree );
1673 
1674  // === clean; return "subs" as vector
1675  SG_FREE(subs);
1676  num_feat = 1;
1677  num_sym = bigTabSize;
1678  use_poim_tries=false;
1679  poim_tries.delete_trees(false);
1680  return w_result;
1681 }
1682 
1684  int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
1685  float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
1686  float64_t* alphas, float64_t* distrib )
1687 {
1689  use_poim_tries=true;
1690  poim_tries.delete_trees(false);
1691 
1692  // === check
1695  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1696  ASSERT(num_feat>0)
1698  ASSERT(max_degree!=0)
1699  ASSERT(distrib)
1700 
1701  // === general variables
1702  static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
1703  const int32_t seqLen = num_feat;
1704  float64_t** subs;
1705  int32_t i;
1706  int32_t k;
1707 
1708  // === DEBUGGING mode
1709  //
1710  // Activated if "max_degree" < 0.
1711  // Allows to output selected partial score.
1712  //
1713  // |max_degree| mod 4
1714  // 0: substring
1715  // 1: superstring
1716  // 2: left overlap
1717  // 3: right overlap
1718  //
1719  const int32_t debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
1720  if( debug ) {
1721  max_degree = abs(max_degree) / 4;
1722  switch( debug ) {
1723  case 1: {
1724  printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
1725  break;
1726  }
1727  case 2: {
1728  printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
1729  break;
1730  }
1731  case 3: {
1732  printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
1733  break;
1734  }
1735  case 4: {
1736  printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
1737  break;
1738  }
1739  default: {
1740  printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
1741  ASSERT(0)
1742  break;
1743  }
1744  }
1745  }
1746 
1747  // --- compute table sizes
1748  int32_t* offsets;
1749  int32_t offset;
1750  offsets = SG_MALLOC(int32_t, max_degree );
1751  offset = 0;
1752  for( k = 0; k < max_degree; ++k ) {
1753  offsets[k] = offset;
1754  const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
1755  const int32_t tabSize = nofsKmers * seqLen;
1756  offset += tabSize;
1757  }
1758  // --- allocate memory
1759  const int32_t bigTabSize=offset;
1760  poim_result=SG_MALLOC(float64_t, bigTabSize);
1761  for (i=0; i<bigTabSize; ++i )
1762  poim_result[i]=0;
1763 
1764  // --- set pointers for tables
1765  subs=SG_MALLOC(float64_t*, max_degree);
1766  for (k=0; k<max_degree; ++k)
1767  subs[k]=&poim_result[offsets[k]];
1768 
1769  SG_FREE(offsets);
1770 
1771  // === init trees; precalc S, L and R
1772  init_optimization( num_suppvec, IDX, alphas, -1);
1773  poim_tries.POIMs_precalc_SLR( distrib );
1774 
1775  // === compute substring scores
1776  if( debug==0 || debug==1 ) {
1777  poim_tries.POIMs_extract_W( subs, max_degree );
1778  for( k = 1; k < max_degree; ++k ) {
1779  const int32_t nofKmers2 = ( k > 1 ) ? (int32_t) CMath::pow(NUM_SYMS,k-1) : 0;
1780  const int32_t nofKmers1 = (int32_t) CMath::pow( NUM_SYMS, k );
1781  const int32_t nofKmers0 = nofKmers1 * NUM_SYMS;
1782  for( i = 0; i < seqLen; ++i ) {
1783  float64_t* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
1784  float64_t* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
1785  float64_t* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
1786  float64_t* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
1787  int32_t y0;
1788  for( y0 = 0; y0 < nofKmers0; ++y0 ) {
1789  const int32_t y1l = y0 / NUM_SYMS;
1790  const int32_t y1r = y0 % nofKmers1;
1791  const int32_t y2 = y1r / NUM_SYMS;
1792  subs_k0i[ y0 ] += subs_k1i0[ y1l ];
1793  if( i < seqLen-1 ) {
1794  subs_k0i[ y0 ] += subs_k1i1[ y1r ];
1795  if( k > 1 ) {
1796  subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
1797  }
1798  }
1799  }
1800  }
1801  }
1802  }
1803 
1804  // === compute POIMs
1805  poim_tries.POIMs_add_SLR( subs, max_degree, debug );
1806 
1807  // === clean; return "subs" as vector
1808  SG_FREE(subs);
1809  num_feat = 1;
1810  num_sym = bigTabSize;
1811 
1812  use_poim_tries=false;
1813  poim_tries.delete_trees(false);
1814 
1815  return poim_result;
1816 }
1817 
1818 
1820 {
1821  SG_FREE(m_poim_distrib);
1822  int32_t num_sym=distrib.num_cols;
1823  int32_t num_feat=distrib.num_rows;
1824  m_poim_distrib=SG_MALLOC(float64_t, num_sym*num_feat);
1825  sg_memcpy(m_poim_distrib, distrib.matrix, num_sym*num_feat*sizeof(float64_t));
1826  m_poim_num_sym=num_sym;
1827  m_poim_num_feat=num_feat;
1828 }
1829 
1831  int32_t max_degree, CSVM* svm)
1832 {
1833  ASSERT(svm)
1834  int32_t num_suppvec=svm->get_num_support_vectors();
1835  int32_t* sv_idx=SG_MALLOC(int32_t, num_suppvec);
1836  float64_t* sv_weight=SG_MALLOC(float64_t, num_suppvec);
1837 
1838  for (int32_t i=0; i<num_suppvec; i++)
1839  {
1840  sv_idx[i]=svm->get_support_vector(i);
1841  sv_weight[i]=svm->get_alpha(i);
1842  }
1843 
1844  if ((max_degree < 1) || (max_degree > 12))
1845  {
1846  //SG_WARNING("max_degree out of range 1..12 (%d).\n", max_degree)
1847  SG_WARNING("max_degree out of range 1..12 (%d). setting to 1.\n", max_degree)
1848  max_degree=1;
1849  }
1850 
1851  int32_t num_feat = m_poim_num_feat;
1852  int32_t num_sym = m_poim_num_sym;
1853  SG_FREE(m_poim);
1854 
1855  m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
1856  sv_weight, m_poim_distrib);
1857 
1858  ASSERT(num_feat==1)
1859  m_poim_result_len=num_sym;
1860 
1861  SG_FREE(sv_weight);
1862  SG_FREE(sv_idx);
1863 }
1864 
1866 {
1868  return poim;
1869 }
1870 
1872 {
1873  SG_FREE(m_poim) ;
1874  m_poim=NULL ;
1875  SG_FREE(m_poim_distrib) ;
1876  m_poim_distrib=NULL ;
1877  m_poim_num_sym=0 ;
1878  m_poim_num_sym=0 ;
1879  m_poim_result_len=0 ;
1880 }
1881 
1883 {
1885 
1888 
1889  if (weights)
1891 }
1892 
1893 void CWeightedDegreePositionStringKernel::init()
1894 {
1895  weights=NULL;
1896  weights_length=0;
1897  weights_degree=0;
1898  position_weights=NULL;
1900 
1901  position_weights_lhs=NULL;
1903  position_weights_rhs=NULL;
1905 
1906  weights_buffer=NULL;
1907  mkl_stepsize=1;
1908  degree=1;
1909  length=0;
1910 
1911  max_shift=0;
1912  max_mismatch=0;
1913  seq_length=0;
1914  shift=NULL;
1915  shift_len=0;
1916 
1917  block_weights=NULL;
1918  block_computation=true;
1919  type=E_EXTERNAL;
1920  which_degree=-1;
1921  tries=CTrie<DNATrie>(1);
1923 
1924  tree_initialized=false;
1925  use_poim_tries=false;
1926  m_poim_distrib=NULL;
1927 
1928  m_poim=NULL;
1929  m_poim_num_sym=0;
1930  m_poim_num_feat=0;
1932 
1933  alphabet=NULL;
1934 
1936 
1938 
1940  "weights", "WD Kernel weights.");
1942  "position_weights",
1943  "Weights per position.");
1945  "position_weights_lhs",
1946  "Weights per position left hand side.");
1948  "position_weights_rhs",
1949  "Weights per position right hand side.");
1951  "shift",
1952  "Shift Vector.");
1953  SG_ADD(&max_shift, "max_shift", "Maximal shift.", MS_AVAILABLE);
1954  SG_ADD(&mkl_stepsize, "mkl_stepsize", "MKL step size.", MS_AVAILABLE);
1955  SG_ADD(&degree, "degree", "Order of WD kernel.", MS_AVAILABLE);
1956  SG_ADD(&max_mismatch, "max_mismatch",
1957  "Number of allowed mismatches.", MS_AVAILABLE);
1958  SG_ADD(&block_computation, "block_computation",
1959  "If block computation shall be used.", MS_NOT_AVAILABLE);
1960  SG_ADD((machine_int_t*) &type, "type",
1961  "WeightedDegree kernel type.", MS_AVAILABLE);
1962  SG_ADD(&which_degree, "which_degree",
1963  "The selected degree. All degrees are used by default (for value -1).",
1964  MS_AVAILABLE);
1965  SG_ADD((CSGObject**) &alphabet, "alphabet",
1966  "Alphabet of Features.", MS_NOT_AVAILABLE);
1967 }
RNA - letters A,C,G,U.
Definition: Alphabet.h:32
virtual void load_serializable_post()
Definition: Kernel.cpp:905
virtual float64_t compute(int32_t idx_a, int32_t idx_b)
int32_t NUM_SYMS
Definition: Trie.h:633
float64_t * compute_scoring(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *target, int32_t num_suppvec, int32_t *IDX, float64_t *weights)
#define SG_INFO(...)
Definition: SGIO.h:117
virtual void cleanup()
Definition: Kernel.cpp:172
T get_element(int32_t index) const
Definition: DynArray.h:142
virtual void compute_batch(int32_t num_vec, int32_t *vec_idx, float64_t *target, int32_t num_suppvec, int32_t *IDX, float64_t *alphas, float64_t factor=1.0)
void add_example_to_single_tree(int32_t idx, float64_t weight, int32_t tree_num)
DNA - letters A,C,G,T.
Definition: Alphabet.h:26
float64_t * compute_abs_weights(int32_t &len)
Definition: Trie.h:1214
static T max(T a, T b)
Definition: Math.h:149
int32_t get_num_threads() const
Definition: Parallel.cpp:97
void fill_backtracking_table(int32_t pos, DynArray< ConsensusEntry > *prev, DynArray< ConsensusEntry > *cur, bool cumulative, float64_t *weights)
Definition: Trie.h:2087
PRange< T > progress(Range< T > range, const SGIO &io, std::string prefix="PROGRESS: ", SG_PRG_MODE mode=UTF8, std::function< bool()> condition=[](){return true;})
Definition: progress.h:712
float64_t compute_with_mismatch(char *avec, int32_t alen, char *bvec, int32_t blen)
Class ShogunException defines an exception which is thrown whenever an error inside of shogun occurs...
float64_t compute_by_tree_helper(int32_t *vec, int32_t len, int32_t seq_pos, int32_t tree_pos, int32_t weight_pos, float64_t *weights, bool degree_times_position_weights)
Definition: Trie.h:1722
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:149
EAlphabet get_alphabet() const
Definition: Alphabet.h:130
virtual float64_t normalize_rhs(float64_t value, int32_t idx_rhs)=0
void set_use_compact_terminal_nodes(bool p_use_compact_terminal_nodes)
Definition: Trie.h:466
Template class Trie implements a suffix trie, i.e. a tree in which all suffixes up to a certain lengt...
Definition: Trie.h:136
float64_t * compute_POIM(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *poim_result, int32_t num_suppvec, int32_t *IDX, float64_t *alphas, float64_t *distrib)
CFeatures * get_rhs()
#define SG_ERROR(...)
Definition: SGIO.h:128
void set_is_initialized(bool p_init)
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
Parameter * m_parameters
Definition: SGObject.h:609
int32_t get_num_elements() const
Definition: DynArray.h:130
void add_to_trie(int32_t i, int32_t seq_offset, int32_t *vec, float32_t alpha, float64_t *weights, bool degree_times_position_weights)
Definition: Trie.h:1496
Parallel * parallel
Definition: SGObject.h:603
Range< T > range(T rend)
Definition: range.h:136
uint8_t remap_to_bin(uint8_t c)
Definition: Alphabet.h:159
void create(int32_t len, bool p_use_compact_terminal_nodes=true)
Definition: Trie.h:1156
bool get_is_initialized()
void POIMs_extract_W(float64_t *const *const W, const int32_t K)
bool get_use_compact_terminal_nodes()
Definition: Trie.h:456
void traverse(int32_t tree, const int32_t p, struct TreeParseInfo info, const int32_t depth, int32_t *const x, const int32_t k)
Definition: Trie.h:1392
virtual int32_t get_vector_length(int32_t vec_num)
virtual bool init_optimization(int32_t p_count, int32_t *IDX, float64_t *alphas)
#define ASSERT(x)
Definition: SGIO.h:176
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:124
void POIMs_precalc_SLR(const float64_t *const distrib)
virtual void add_example_to_tree(int32_t idx, float64_t weight)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:32
double float64_t
Definition: common.h:60
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
EOptimizationType get_optimization_type()
float64_t compute_without_mismatch_matrix(char *avec, int32_t alen, char *bvec, int32_t blen)
index_t num_rows
Definition: SGMatrix.h:495
virtual void set_position_weights(SGVector< float64_t > pws)
float64_t get_alpha(int32_t idx)
void destroy()
Definition: Trie.h:1135
SGVector< ST > get_feature_vector(int32_t num)
index_t num_cols
Definition: SGMatrix.h:497
int32_t get_support_vector(int32_t idx)
void delete_trees(bool p_use_compact_terminal_nodes=true)
Definition: Trie.h:1171
virtual bool init_normalizer()
Definition: Kernel.cpp:167
float float32_t
Definition: common.h:59
EOptimizationType opt_type
CFeatures * rhs
feature vectors to occur on right hand side
static int64_t nchoosek(int32_t n, int32_t k)
Definition: Math.h:1009
#define SG_UNREF(x)
Definition: SGObject.h:53
bool set_position_weights_lhs(float64_t *pws, int32_t len, int32_t num)
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
Definition: Parameter.cpp:335
#define SG_DEBUG(...)
Definition: SGIO.h:106
char * compute_consensus(int32_t &num_feat, int32_t num_suppvec, int32_t *IDX, float64_t *alphas)
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
T sum(const Container< T > &a, bool no_diag=false)
int machine_int_t
Definition: common.h:69
CFeatures * lhs
feature vectors to occur on left hand side
The class Features is the base class of all feature objects.
Definition: Features.h:69
static float64_t log(float64_t v)
Definition: Math.h:714
virtual int32_t get_max_vector_length()
virtual void remove_lhs()
Definition: Kernel.cpp:655
A generic Support Vector Machine Interface.
Definition: SVM.h:49
CKernelNormalizer * normalizer
bool set_position_weights_rhs(float64_t *pws, int32_t len, int32_t num)
#define SG_WARNING(...)
Definition: SGIO.h:127
The Weighted Degree Position String kernel (Weighted Degree kernel with shifts).
#define SG_ADD(...)
Definition: SGObject.h:93
friend class CSqrtDiagKernelNormalizer
void POIMs_add_SLR(float64_t *const *const poims, const int32_t K, const int32_t debug)
float64_t * extract_w(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *w_result, int32_t num_suppvec, int32_t *IDX, float64_t *alphas)
virtual float64_t normalize_lhs(float64_t value, int32_t idx_lhs)=0
static T min(T a, T b)
Definition: Math.h:138
void set_position_weights(float64_t *p_position_weights)
Definition: Trie.h:485
void add_matrix(bool **param, index_t *length_y, index_t *length_x, const char *name, const char *description="")
Definition: Parameter.cpp:944
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
bool have_same_length(int32_t len=-1)
static int32_t pow(bool x, int32_t n)
Definition: Math.h:474
index_t vlen
Definition: SGVector.h:571
float64_t compute_without_mismatch(char *avec, int32_t alen, char *bvec, int32_t blen)
float64_t compute_without_mismatch_position_weights(char *avec, float64_t *posweights_lhs, int32_t alen, char *bvec, float64_t *posweights_rhs, int32_t blen)

SHOGUN Machine Learning Toolbox - Documentation