SHOGUN  6.1.3
CommUlongStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #include <shogun/base/progress.h>
13 #include <shogun/io/SGIO.h>
15 #include <shogun/lib/common.h>
16 
18 
19 using namespace shogun;
20 
22 : CStringKernel<uint64_t>(size), use_sign(us)
23 {
25  clear_normal();
26 
28 }
29 
32  int32_t size)
33 : CStringKernel<uint64_t>(size), use_sign(us)
34 {
36  clear_normal();
38  init(l,r);
39 }
40 
42 {
43  cleanup();
44 }
45 
47 {
49 
50 #ifdef SVMLIGHT
51  if (lhs)
52  cache_reset();
53 #endif
54 
55  lhs = NULL ;
56  rhs = NULL ;
57 }
58 
60 {
61 #ifdef SVMLIGHT
62  if (rhs)
63  cache_reset();
64 #endif
65 
66  rhs = lhs;
67 }
68 
69 bool CCommUlongStringKernel::init(CFeatures* l, CFeatures* r)
70 {
72  return init_normalizer();
73 }
74 
76 {
78  clear_normal();
80 }
81 
82 float64_t CCommUlongStringKernel::compute(int32_t idx_a, int32_t idx_b)
83 {
84  int32_t alen, blen;
85  bool free_avec, free_bvec;
86  uint64_t* avec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
87  uint64_t* bvec=((CStringFeatures<uint64_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
88 
89  float64_t result=0;
90 
91  int32_t left_idx=0;
92  int32_t right_idx=0;
93 
94  if (use_sign)
95  {
96  while (left_idx < alen && right_idx < blen)
97  {
98  if (avec[left_idx]==bvec[right_idx])
99  {
100  uint64_t sym=avec[left_idx];
101 
102  while (left_idx< alen && avec[left_idx]==sym)
103  left_idx++;
104 
105  while (right_idx< blen && bvec[right_idx]==sym)
106  right_idx++;
107 
108  result++;
109  }
110  else if (avec[left_idx]<bvec[right_idx])
111  left_idx++;
112  else
113  right_idx++;
114  }
115  }
116  else
117  {
118  while (left_idx < alen && right_idx < blen)
119  {
120  if (avec[left_idx]==bvec[right_idx])
121  {
122  int32_t old_left_idx=left_idx;
123  int32_t old_right_idx=right_idx;
124 
125  uint64_t sym=avec[left_idx];
126 
127  while (left_idx< alen && avec[left_idx]==sym)
128  left_idx++;
129 
130  while (right_idx< blen && bvec[right_idx]==sym)
131  right_idx++;
132 
133  result+=((float64_t) (left_idx-old_left_idx)) * ((float64_t) (right_idx-old_right_idx));
134  }
135  else if (avec[left_idx]<bvec[right_idx])
136  left_idx++;
137  else
138  right_idx++;
139  }
140  }
141  ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
142  ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
143 
144  return result;
145 }
146 
148 {
149  int32_t t=0;
150  int32_t j=0;
151  int32_t k=0;
152  int32_t last_j=0;
153  int32_t len=-1;
154  bool free_vec;
155  uint64_t* vec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(vec_idx, len, free_vec);
156 
157  if (vec && len>0)
158  {
159  int32_t max_len = len+dictionary.vlen;
160  SGVector<uint64_t> dic(max_len);
161  SGVector<float64_t> dic_weights(max_len);
162 
163  if (use_sign)
164  {
165  for (j=1; j<len; j++)
166  {
167  if (vec[j]==vec[j-1])
168  continue;
169 
170  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
171  }
172 
173  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
174 
175  while (k<dictionary.vlen)
176  {
177  dic[t]=dictionary[k];
178  dic_weights[t]=dictionary_weights[k];
179  t++;
180  k++;
181  }
182  }
183  else
184  {
185  for (j=1; j<len; j++)
186  {
187  if (vec[j]==vec[j-1])
188  continue;
189 
190  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
191  last_j = j;
192  }
193 
194  merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
195 
196  while (k<dictionary.vlen)
197  {
198  dic[t]=dictionary[k];
199  dic_weights[t]=dictionary_weights[k];
200  t++;
201  k++;
202  }
203  }
204 
205  dic.resize_vector(t);
206  dic_weights.resize_vector(t);
207  dictionary = dic;
208  dictionary_weights = dic_weights;
209  }
210  ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(vec, vec_idx, free_vec);
211 
212  set_is_initialized(true);
213 }
214 
216 {
219  set_is_initialized(false);
220 }
221 
223  int32_t count, int32_t *IDX, float64_t * weights)
224 {
225  clear_normal();
226 
227  if (count<=0)
228  {
229  set_is_initialized(true);
230  SG_DEBUG("empty set of SVs\n")
231  return true;
232  }
233 
234  SG_DEBUG("initializing CCommUlongStringKernel optimization\n")
235 
236  for (auto i : progress(range(0, count), *this->io))
237  {
238  add_to_normal(IDX[i], weights[i]);
239  }
240 
241  SG_DEBUG("Done. \n")
242 
243  set_is_initialized(true);
244  return true;
245 }
246 
248 {
249  SG_DEBUG("deleting CCommUlongStringKernel optimization\n")
250  clear_normal();
251  return true;
252 }
253 
254 // binary search for each feature. trick: as features are sorted save last found idx in old_idx and
255 // only search in the remainder of the dictionary
257 {
258  float64_t result = 0;
259  int32_t j, last_j=0;
260  int32_t old_idx = 0;
261 
262  if (!get_is_initialized())
263  {
264  SG_ERROR("CCommUlongStringKernel optimization not initialized\n")
265  return 0 ;
266  }
267 
268 
269 
270  int32_t alen = -1;
271  bool free_avec;
272  uint64_t* avec=((CStringFeatures<uint64_t>*) rhs)->
273  get_feature_vector(i, alen, free_avec);
274 
275  if (avec && alen>0)
276  {
277  if (use_sign)
278  {
279  for (j=1; j<alen; j++)
280  {
281  if (avec[j]==avec[j-1])
282  continue;
283 
284  int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[j-1]);
285 
286  if (idx!=-1)
287  {
288  if (dictionary[idx+old_idx] == avec[j-1])
289  result += dictionary_weights[idx+old_idx];
290 
291  old_idx+=idx;
292  }
293  }
294 
295  int32_t idx = CMath::binary_search(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[alen-1]);
296  if (idx!=-1)
297  result += dictionary_weights[idx+old_idx];
298  }
299  else
300  {
301  for (j=1; j<alen; j++)
302  {
303  if (avec[j]==avec[j-1])
304  continue;
305 
306  int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[j-1]);
307 
308  if (idx!=-1)
309  {
310  if (dictionary[idx+old_idx] == avec[j-1])
311  result += dictionary_weights[idx+old_idx]*(j-last_j);
312 
313  old_idx+=idx;
314  }
315 
316  last_j = j;
317  }
318 
319  int32_t idx = CMath::binary_search(&(dictionary[old_idx]), dictionary.vlen-old_idx, avec[alen-1]);
320  if (idx!=-1)
321  result += dictionary_weights[idx+old_idx]*(alen-last_j);
322  }
323  }
324 
325  ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(avec, i, free_avec);
326 
327  return normalizer->normalize_rhs(result, i);
328 }
virtual void cleanup()
Definition: Kernel.cpp:172
PRange< T > progress(Range< T > range, const SGIO &io, std::string prefix="PROGRESS: ", SG_PRG_MODE mode=UTF8, std::function< bool()> condition=[](){return true;})
Definition: progress.h:712
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:149
virtual float64_t compute_optimized(int32_t idx)
static int32_t binary_search(T *output, int32_t size, T elem)
Definition: Math.h:1629
virtual float64_t normalize_rhs(float64_t value, int32_t idx_rhs)=0
static int32_t binary_search_max_lower_equal(T *output, int32_t size, T elem)
Definition: Math.h:1697
#define SG_ERROR(...)
Definition: SGIO.h:128
void set_is_initialized(bool p_init)
Range< T > range(T rend)
Definition: range.h:136
virtual bool init(CFeatures *l, CFeatures *r)
bool get_is_initialized()
CCommUlongStringKernel(int32_t size=10, bool use_sign=false)
float64_t compute(int32_t idx_a, int32_t idx_b)
Template class StringFeatures implements a list of strings.
double float64_t
Definition: common.h:60
virtual void add_to_normal(int32_t idx, float64_t weight)
virtual bool init_optimization(int32_t count, int32_t *IDX, float64_t *weights)
virtual bool init_normalizer()
Definition: Kernel.cpp:167
CFeatures * rhs
feature vectors to occur on right hand side
#define SG_DEBUG(...)
Definition: SGIO.h:106
SGVector< float64_t > dictionary_weights
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
CFeatures * lhs
feature vectors to occur on left hand side
The class Features is the base class of all feature objects.
Definition: Features.h:69
CKernelNormalizer * normalizer
void resize_vector(int32_t n)
Definition: SGVector.cpp:301
friend class CSqrtDiagKernelNormalizer
void merge_dictionaries(int32_t &t, int32_t j, int32_t &k, uint64_t *vec, SGVector< uint64_t > dic, SGVector< float64_t > dic_weights, float64_t weight, int32_t vec_idx)
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
index_t vlen
Definition: SGVector.h:571

SHOGUN Machine Learning Toolbox - Documentation