ceras
yet another deep learning engine
optimizer.hpp
#ifndef XNRPSJMCYFXBDGNRJAWDNDIYQNGNXMRVLEHGNQWILKMTHGNOVHODLLXCCNIMUUFQSMOIYHDUD
#define XNRPSJMCYFXBDGNRJAWDNDIYQNGNXMRVLEHGNQWILKMTHGNOVHODLLXCCNIMUUFQSMOIYHDUD

#include "./config.hpp"
#include "./operation.hpp"
#include "./place_holder.hpp"
#include "./variable.hpp"
#include "./session.hpp"
#include "./utils/color.hpp"
#include "./utils/debug.hpp"
#include "./utils/id.hpp"
#include "./utils/enable_shared.hpp"

namespace ceras
{

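    //
    // Each optimizer below keeps a reference to a loss expression. Calling forward() first
    // runs loss_.backward( ones<T>( {1,} ) ) to back-propagate a unit gradient from the loss,
    // then walks the trainable variables registered in the default session and updates their
    // data in place. Per-variable state (momenta, running averages of squared gradients, ...)
    // lives in the variable's context tensors, created lazily on the first update.
    //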
    // sgd: stochastic gradient descent optimizer with optional momentum
    // - loss: the loss expression to minimize
    // - batch_size: training batch size; the learning rate is divided by this value
    // - learning_rate: step size
    // - momentum: momentum factor, typically in [0, 1)
    // - decay: learning rate decay, should be very small, such as 1.0e-8
    // - nesterov: if true, apply Nesterov momentum
    //
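    // The update implemented in sgd::forward is the usual momentum scheme, written here as an
    // informal sketch (m is the per-variable context tensor, g the gradient, w the variable
    // data and lr the batch-size-scaled learning rate):
    //
    //     m <- momentum * m - lr * g
    //     w <- w + m                            (plain momentum)
    //     w <- w + momentum * m - lr * g        (nesterov look-ahead)
    //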
    template< typename Loss, typename T >
    struct sgd : enable_id<sgd<Loss, T>, "sgd optimizer">, enable_shared<sgd<Loss, T>>
    {
        typedef tensor<T> tensor_type;

        Loss& loss_;
        T learning_rate_;
        T momentum_;
        T decay_;
        bool nesterov_;
        unsigned long iterations_;

        sgd( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T momentum=0.0, T decay=0.0, bool nesterov=false ) noexcept :
            loss_{loss}, learning_rate_{learning_rate}, momentum_{std::max(T{0}, momentum)}, decay_{std::max(T{0}, decay)}, nesterov_{nesterov}, iterations_{0}
        {
            better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
            learning_rate_ /= static_cast<T>( batch_size ); // scale the step size by the batch size
        }

        void forward()
        {
            loss_.backward( ones<T>( {1,} ) ); // back-propagate a unit gradient from the loss
            learning_rate_ /= ( 1.0 + decay_ * iterations_ );
            auto& ss = get_default_session<tensor_type>();
            for ( auto [id, v] : ss.variables_ )
            {
                if ( v.trainable_ )
                {
                    auto& data = v.data();
                    auto& gradient = v.gradient();
                    auto& contexts = v.contexts();
                    if ( contexts.empty() ) // create the momentum context on first use
                        contexts.push_back( zeros_like( data ) );
                    auto& moments = contexts[0];
                    // m <- momentum * m - lr * g
                    for_each( moments.begin(), moments.end(), gradient.begin(), [this]( T& m, T g ) { m *= (*this).momentum_; m -= (*this).learning_rate_ * g; } );
                    if ( nesterov_ ) // look-ahead update: w <- w + momentum * m - lr * g
                        for_each( moments.begin(), moments.end(), data.begin(), gradient.begin(), [this]( T m, T& w, T g ) { w += (*this).momentum_ * m - (*this).learning_rate_ * g; } );
                    else // plain momentum update: w <- w + m
                        data += moments;

                    gradient.reset(); // clear the variable's gradient
                }
            }
            ++iterations_;
        }//sgd::forward
    };//sgd

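    // adagrad: adaptive gradient optimizer
    // - loss: the loss expression to minimize
    // - batch_size: training batch size; the learning rate is divided by this value
    // - learning_rate: initial step size
    // - decay: learning rate decay, should be very small, such as 1.0e-8
    //
    // Sketch of the update implemented in adagrad::forward (G is the accumulated squared
    // gradient kept in the variable's context, eps a small constant defined elsewhere in the
    // library):
    //
    //     G <- G + g * g
    //     w <- w - lr * g / ( eps + sqrt(G) )
    //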
    template< typename Loss, typename T >
    struct adagrad : enable_id<adagrad<Loss, T>, "adagrad optimizer">, enable_shared<adagrad<Loss, T>>
    {
        typedef tensor<T> tensor_type;

        Loss& loss_;
        T learning_rate_;
        T decay_;
        unsigned long iterations_;

        adagrad( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T decay=0.0 ) noexcept :
            loss_{loss}, learning_rate_{learning_rate}, decay_{std::max(T{0}, decay)}, iterations_{0}
        {
            better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
            learning_rate_ /= static_cast<T>( batch_size );
        }

        void forward()
        {
            loss_.backward( ones<T>( {1,} ) );

            learning_rate_ /= ( 1.0 + decay_ * iterations_ );

            auto& ss = get_default_session<tensor_type>();
            for ( auto [id, v] : ss.variables_ )
            {
                if ( v.trainable_ )
                {
                    auto& data = v.data();
                    auto& gradient = v.gradient();
                    auto& contexts = v.contexts();
                    if ( contexts.empty() ) // create the accumulator context on first use
                        contexts.push_back( zeros_like( data ) );
                    auto& moments = contexts[0];

                    // G <- G + g * g
                    for_each( moments.begin(), moments.end(), gradient.begin(), []( T& m, T g ) { m += g * g; } );

                    // w <- w - lr * g / ( eps + sqrt(G) )
                    for_each( data.begin(), data.end(), gradient.begin(), moments.begin(), [this]( T& d, T g, T m ) { d -= (*this).learning_rate_ * g / ( eps + std::sqrt(m) ); } );

                    gradient.reset(); // clear the variable's gradient
                }
            }
            ++iterations_;
        }//adagrad::forward
    };//adagrad

    template< typename Loss, typename T >

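    // rmsprop: root-mean-square propagation optimizer
    // - loss: the loss expression to minimize
    // - batch_size: training batch size; the learning rate is divided by this value
    // - learning_rate: step size
    // - rho: discounting factor for the running average of squared gradients
    // - decay: learning rate decay, should be very small, such as 1.0e-8
    //
    // Sketch of the update implemented in rmsprop::forward (E is the running average of
    // squared gradients kept in the variable's context):
    //
    //     E <- rho * E + (1 - rho) * g * g      (E <- g * g on the very first iteration)
    //     w <- w - lr * g / ( eps + sqrt(E) )
    //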
    template< typename Loss, typename T >
    struct rmsprop : enable_id<rmsprop<Loss, T>, "rmsprop optimizer">, enable_shared<rmsprop<Loss, T>>
    {
        typedef tensor<T> tensor_type;

        Loss& loss_;
        T learning_rate_;
        T rho_;
        T decay_;
        unsigned long iterations_;

        rmsprop( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T rho=0.9, T decay=0.0 ) noexcept :
            loss_{loss}, learning_rate_{learning_rate}, rho_{rho}, decay_{std::max(T{0}, decay)}, iterations_{0}
        {
            better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
            learning_rate_ /= static_cast<T>( batch_size );
        }

        void forward()
        {
            loss_.backward( ones<T>( {1,} ) );

            learning_rate_ /= ( 1.0 + decay_ * iterations_ );

            auto& ss = get_default_session<tensor_type>();
            for ( auto [id, v] : ss.variables_ )
            {
                if ( v.trainable_ )
                {
                    auto& data = v.data();
                    auto& gradient = v.gradient();
                    auto& contexts = v.contexts();
                    if ( contexts.empty() ) // create the running-average context on first use
                        contexts.push_back( zeros_like( data ) );
                    auto& moments = contexts[0];

                    if ( iterations_ == 0 ) // initialize the running average with the first squared gradient
                        for_each( moments.begin(), moments.end(), gradient.begin(), []( T& m, T g ) { m = g * g; } );
                    else // E <- rho * E + (1 - rho) * g * g
                        for_each( moments.begin(), moments.end(), gradient.begin(), [this]( T& m, T g ) { m *= (*this).rho_; m += g * g * ( 1.0 - (*this).rho_ ); } );

                    // w <- w - lr * g / ( eps + sqrt(E) )
                    for_each( data.begin(), data.end(), gradient.begin(), moments.begin(), [this]( T& d, T g, T m ) { d -= (*this).learning_rate_ * g / ( eps + std::sqrt(m) ); } );

                    gradient.reset(); // clear the variable's gradient
                }
            }
            ++iterations_;
        }//rmsprop::forward
    };//rmsprop

    template< typename Loss, typename T >

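    // adadelta: adadelta optimizer, with no explicit learning rate beyond the 1/batch_size scaling
    // - loss: the loss expression to minimize
    // - batch_size: training batch size
    // - rho: decay rate of the running averages
    //
    // Sketch of the update implemented in adadelta::forward (E is the running average of
    // squared gradients, D the running average of squared updates, both kept in the variable's
    // contexts):
    //
    //     E  <- rho * E + (1 - rho) * g * g
    //     dw <- sqrt( (D + eps) / (E + eps) ) * g
    //     w  <- w - dw
    //     D  <- rho * D + (1 - rho) * dw * dw
    //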
    template< typename Loss, typename T >
    struct adadelta : enable_id<adadelta<Loss, T>, "adadelta optimizer">, enable_shared<adadelta<Loss, T>>
    {
        typedef tensor<T> tensor_type;

        Loss& loss_;
        T rho_;
        T learning_rate_;
        unsigned long iterations_;

        adadelta( Loss& loss, std::size_t batch_size, T rho=0.9 ) noexcept : loss_{loss}, rho_{rho}, iterations_{0}
        {
            better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
            learning_rate_ = T{1} / static_cast<T>( batch_size );
        }

        void forward()
        {
            loss_.backward( ones<T>( {1,} ) );

            auto& ss = get_default_session<tensor_type>();
            for ( auto [id, v] : ss.variables_ )
            {
                if ( v.trainable_ )
                {
                    auto& data = v.data();
                    auto& gradient = v.gradient();
                    auto& contexts = v.contexts();
                    if ( contexts.empty() ) // create the two running-average contexts on first use
                    {
                        contexts.push_back( zeros_like( data ) ); // running average of squared gradients
                        contexts.push_back( zeros_like( data ) ); // running average of squared updates
                    }
                    auto& moments = contexts[0];
                    auto& delta = contexts[1];

                    // E <- rho * E + (1 - rho) * g * g
                    for_each( moments.begin(), moments.end(), gradient.begin(), [this]( T& m, T g ) { m *= (*this).rho_; m += g * g * ( 1.0 - (*this).rho_ ); } );

                    // g_ = lr * sqrt( (D + eps) / (E + eps) ) * g
                    for_each( gradient.begin(), gradient.end(), delta.begin(), moments.begin(), [this]( T& g, T d, T m ) { g *= (*this).learning_rate_ * std::sqrt( ( d + eps ) / ( m + eps ) ); } );
                    // w <- w - g_
                    data -= gradient;
                    // D <- rho * D + (1 - rho) * g_ * g_
                    for_each( delta.begin(), delta.end(), gradient.begin(), [this]( T& d, T g ) { d *= (*this).rho_; d += ( 1.0 - (*this).rho_ ) * g * g; } );

                    gradient.reset(); // clear the variable's gradient
                }
            }
            ++iterations_;
        }//adadelta::forward
    };//adadelta

    template< typename Loss, typename T >

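    // adam: adaptive moment estimation optimizer
    // - loss: the loss expression to minimize
    // - batch_size: training batch size; the learning rate is divided by this value
    // - learning_rate: step size
    // - beta_1: exponential decay rate for the first moment estimate
    // - beta_2: exponential decay rate for the second moment estimate
    // - amsgrad: reserved flag; the amsgrad variant is not implemented yet (see the TODO in adam::forward)
    //
    // Sketch of the update implemented in adam::forward (m and v are the first and second
    // moment estimates kept in the variable's contexts, t the iteration counter):
    //
    //     m    <- beta_1 * m + (1 - beta_1) * g
    //     v    <- beta_2 * v + (1 - beta_2) * g * g
    //     lr_t  = lr * sqrt( 1 - beta_2^(t+1) ) / ( 1 - beta_1^(t+1) )
    //     w    <- w - lr_t * m / ( eps + sqrt(v) )
    //
    // The very first iterations fall back to a plain gradient step while the moment estimates
    // are still warming up.
    //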
    template< typename Loss, typename T >
    struct adam : enable_id<adam<Loss, T>, "adam optimizer">, enable_shared<adam<Loss, T>>
    {
        typedef tensor<T> tensor_type;

        Loss& loss_;
        T learning_rate_;
        T beta_1_;
        T beta_2_;
        bool amsgrad_;
        unsigned long iterations_;

        adam( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T beta_1=0.9, T beta_2=0.999, bool amsgrad=false ) noexcept :
            loss_{loss}, learning_rate_{learning_rate}, beta_1_{beta_1}, beta_2_{beta_2}, amsgrad_{amsgrad}, iterations_{0}
        {
            better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
            learning_rate_ /= static_cast<T>( batch_size );
        }

        void forward()
        {
            loss_.backward( ones<T>( {1,} ) );
            auto& ss = get_default_session<tensor_type>();
            for ( auto [id, v] : ss.variables_ )
            {
                if ( v.trainable_ )
                {
                    auto& data = v.data();
                    auto& gradient = v.gradient();
                    auto& contexts = v.contexts();
                    if ( contexts.empty() ) // create the two moment contexts on first use
                    {
                        contexts.push_back( zeros_like( data ) ); // first moment estimate
                        contexts.push_back( zeros_like( data ) ); // second moment estimate
                    }
                    auto& first_moment = contexts[0];
                    auto& second_moment = contexts[1];

                    T const b_beta_1 = beta_1_;
                    T const b_beta_2 = beta_2_;

                    // m <- beta_1 * m + (1 - beta_1) * g
                    for_each( first_moment.begin(), first_moment.end(), gradient.begin(), [b_beta_1]( T& m_, T g_ ) { m_ *= b_beta_1; m_ += g_ * ( 1.0 - b_beta_1 ); } );

                    // v <- beta_2 * v + (1 - beta_2) * g * g
                    for_each( second_moment.begin(), second_moment.end(), gradient.begin(), [b_beta_2]( T& v_, T g_ ) { v_ *= b_beta_2; v_ += g_ * g_ * ( 1.0 - b_beta_2 ); } );

                    // bias-corrected step size
                    T lr = learning_rate_ * std::sqrt( 1.0 - std::pow( beta_2_, iterations_+1 ) ) / ( 1.0 - std::pow( beta_1_, iterations_+1 ) );

                    if ( iterations_ > 1 ) // w <- w - lr_t * m / ( eps + sqrt(v) )
                        for_each( data.begin(), data.end(), first_moment.begin(), second_moment.begin(), [lr]( T& d_, T m_, T v_ ) { d_ -= lr * m_ / ( eps + std::sqrt(v_) ); } );
                    else // warm-up: plain gradient step while the moment estimates are still biased
                        for_each( data.begin(), data.end(), gradient.begin(), [this]( T& d_, T g_ ) { d_ -= (*this).learning_rate_ * g_; } );

                    gradient.reset(); // clear the variable's gradient
                    // TODO: enabling amsgrad
                }
            }//loop of variables
            ++iterations_;
        }//adam::forward
    };//adam

    // Example usage:
    //
    // auto& ss = get_default_session<tensor<float>>();
    // auto loss = ...;
    // auto optimizer = gradient_descent{ loss, batch_size, 1.0e-3f };
    // for i = 1 : 1000
    //     ss.run( loss, batch_size )
    //     ss.run( optimizer )
    //
    template< typename Loss, typename T >
    struct gradient_descent : enable_id<gradient_descent<Loss, T>, "gradient_descent optimizer">, enable_shared<gradient_descent<Loss, T>>
    {
        typedef tensor<T> tensor_type;

        Loss& loss_;
        T learning_rate_;
        T momentum_; // stored but not yet used in forward()

        gradient_descent( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-3, T momentum=0.0 ) noexcept : loss_{loss}, learning_rate_{learning_rate}, momentum_{momentum}
        {
            learning_rate_ /= static_cast<T>( batch_size ); // scale the step size by the batch size
        }

        void forward()
        {
            // update the gradients stored in the loss expression
            loss_.backward( ones<T>( {1,} ) );
            // update the trainable variables
            auto& ss = get_default_session<tensor_type>();
            for ( auto& [id, v] : ss.variables_ )
            {
                if ( v.trainable_ )
                {
                    auto& gradient = v.gradient();
                    better_assert( !has_nan(gradient), "gradient_descent error, tensor with id ", id, " has a nan value." );
                    // w <- w - lr * g
                    v.data() -= learning_rate_ * gradient;

                    gradient.reset(); // clear the variable's gradient
                }
            }
        }
    };//gradient_descent

    // TODO: adamax, nadam, ftrl


    //
    // optimizer interfaces
    //

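    // Each interface below is a factory lambda: it captures the hyper-parameters first and
    // returns a second lambda that, given a loss expression satisfying the Expression concept,
    // constructs the corresponding optimizer. This lets model code choose an optimizer before
    // the loss expression has been built.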
    inline auto Adam = []( auto ... args )
    {
        return [=]<Expression Ex>( Ex& loss )
        {
            return adam{loss, args...};
        };
    };

    inline auto SGD = []( auto ... args )
    {
        return [=]<Expression Ex>( Ex& loss )
        {
            return sgd{loss, args...};
        };
    };

    inline auto Adagrad = []( auto ... args )
    {
        return [=]<Expression Ex>( Ex& loss )
        {
            return adagrad{loss, args...};
        };
    };

    inline auto RMSprop = []( auto ... args )
    {
        return [=]<Expression Ex>( Ex& loss )
        {
            return rmsprop{loss, args...};
        };
    };

    inline auto Adadelta = []( auto ... args )
    {
        return [=]<Expression Ex>( Ex& loss )
        {
            return adadelta{loss, args...};
        };
    };

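    // A minimal usage sketch of these interfaces (not from the original file; `loss` and
    // `batch_size` are placeholder names, and the session calls follow the gradient_descent
    // example above): hyper-parameters are bound first, the loss expression second, and the
    // resulting optimizer is then driven through the session.
    //
    //     auto& ss = get_default_session<tensor<float>>();
    //     auto loss = ...;                                   // an Expression
    //     auto optimizer = SGD( batch_size, 0.01f )( loss );  // sgd with lr scaled by batch_size
    //     for ( unsigned long step = 0; step != 1000; ++step )
    //     {
    //         ss.run( loss, batch_size );
    //         ss.run( optimizer );
    //     }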
}//namespace ceras

#endif//XNRPSJMCYFXBDGNRJAWDNDIYQNGNXMRVLEHGNQWILKMTHGNOVHODLLXCCNIMUUFQSMOIYHDUD