#ifndef XNRPSJMCYFXBDGNRJAWDNDIYQNGNXMRVLEHGNQWILKMTHGNOVHODLLXCCNIMUUFQSMOIYHDUD
#define XNRPSJMCYFXBDGNRJAWDNDIYQNGNXMRVLEHGNQWILKMTHGNOVHODLLXCCNIMUUFQSMOIYHDUD

#include <cmath>    // std::sqrt
#include <cstddef>  // std::size_t

#include "./utils/color.hpp"
#include "./utils/debug.hpp"
#include "./utils/id.hpp"
#include "./utils/enable_shared.hpp"
template< typename Loss, typename T >
struct sgd : enable_id<sgd<Loss, T>, "sgd optimizer">, enable_shared<sgd<Loss, T>>
{
    typedef tensor<T> tensor_type;

    Loss& loss_;
    T learning_rate_;
    T momentum_;
    T decay_;
    bool nesterov_;
    unsigned long iterations_;

    sgd( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T momentum=0.0, T decay=0.0, bool nesterov=false ) noexcept :
        loss_{loss}, learning_rate_{learning_rate}, momentum_{momentum}, decay_{decay}, nesterov_{nesterov}
    {
        better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
        // ...
    }

    void forward()
    {
        loss_.backward( ones<T>( {1, } ) );

        auto& ss = get_default_session<tensor_type>();
        for ( auto [id, v] : ss.variables_ )
        {
            auto& data = v.data();
            auto& gradient = v.gradient();
            auto& contexts = v.contexts();
            if ( contexts.empty() ) // lazily allocate the velocity buffer
                contexts.push_back( zeros_like( gradient ) );
            auto& moments = contexts[0];

            // velocity: m <- momentum * m - learning_rate * g
            for_each( moments.begin(), moments.end(), gradient.begin(),
                      [this]( T& m, T g ) { m *= momentum_; m -= learning_rate_ * g; } );

            // parameter update: w <- w + momentum * m - learning_rate * g
            if ( !nesterov_ )
                for_each( moments.begin(), moments.end(), data.begin(), gradient.begin(),
                          [this]( T m, T& w, T g ) { w += momentum_ * m - learning_rate_ * g; } );
            // ...
        }
    }
};
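
// A minimal usage sketch, assuming `loss` is a scalar loss expression (not defined in
// this header) whose trainable variables are registered in the default session; the
// numeric arguments are illustrative:
//
//     sgd opt{ loss, /*batch_size=*/32, /*learning_rate=*/1.0e-2, /*momentum=*/0.9,
//              /*decay=*/0.0, /*nesterov=*/true };
//     for ( unsigned long step = 0; step != 1000; ++step )
//         opt.forward();   // backward pass on `loss`, then in-place parameter update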
template< typename Loss, typename T >
struct adagrad : enable_id<adagrad<Loss, T>, "adagrad optimizer">, enable_shared<adagrad<Loss, T>>
{
    typedef tensor<T> tensor_type;
    Loss& loss_;
    T learning_rate_;
    T decay_;
    unsigned long iterations_;

    adagrad( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T decay=0.0 ) noexcept :
        loss_{loss}, learning_rate_{learning_rate}, decay_{decay}
    {
        better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
        // ...
    }

    void forward()
    {
        loss_.backward( ones<T>( {1, } ) );
        auto& ss = get_default_session<tensor_type>();
        for ( auto [id, v] : ss.variables_ )
        {
            auto& data = v.data();
            auto& gradient = v.gradient();
            auto& contexts = v.contexts();
            if ( contexts.empty() ) // lazily allocate the squared-gradient accumulator
                contexts.push_back( zeros_like( gradient ) );
            auto& moments = contexts[0];

            // accumulate squared gradients: m <- m + g^2
            for_each( moments.begin(), moments.end(), gradient.begin(), []( T& m, T g ) { m += g*g; } );

            // parameter update: d <- d - learning_rate * g / (eps + sqrt(m))
            for_each( data.begin(), data.end(), gradient.begin(), moments.begin(),
                      [this]( T& d, T g, T m ) { d -= learning_rate_ * g / (eps + std::sqrt(m)); } );
        }
    }
};
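
// A minimal usage sketch, assuming the same hypothetical `loss` expression as above;
// because the squared-gradient accumulator only grows, the effective step size
// learning_rate / (eps + sqrt(m)) shrinks as training proceeds:
//
//     adagrad opt{ loss, /*batch_size=*/32, /*learning_rate=*/1.0e-2 };
//     opt.forward();   // one accumulation step followed by an in-place update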
template< typename Loss, typename T >
struct rmsprop : enable_id< rmsprop< Loss, T >, "rmsprop optimizer" >, enable_shared<rmsprop<Loss, T>>
{
    typedef tensor<T> tensor_type;
    Loss& loss_;
    T learning_rate_;
    T rho_;
    T decay_;
    unsigned long iterations_;

    rmsprop( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T rho=0.9, T decay=0.0 ) noexcept :
        loss_{loss}, learning_rate_{learning_rate}, rho_{rho}, decay_{decay}
    {
        better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
        // ...
    }

    void forward()
    {
        loss_.backward( ones<T>( {1, } ) );
        auto& ss = get_default_session<tensor_type>();
        for ( auto [id, v] : ss.variables_ )
        {
            auto& data = v.data();
            auto& gradient = v.gradient();
            auto& contexts = v.contexts();
            if ( contexts.empty() ) // lazily allocate the running average of squared gradients
                contexts.push_back( zeros_like( gradient ) );
            auto& moments = contexts[0];

            // m <- g^2
            for_each( moments.begin(), moments.end(), gradient.begin(), [this]( T& m, T g ) { m = g*g; } );
            // ...
            // exponential moving average: m <- rho * m + (1 - rho) * g^2
            for_each( moments.begin(), moments.end(), gradient.begin(),
                      [this]( T& m, T g ) { m *= rho_; m += g*g*(1.0-rho_); } );

            // parameter update: d <- d - learning_rate * g / (eps + sqrt(m))
            for_each( data.begin(), data.end(), gradient.begin(), moments.begin(),
                      [this]( T& d, T g, T m ) { d -= learning_rate_ * g / (eps + std::sqrt(m)); } );
        }
    }
};
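
// A minimal usage sketch, assuming the same hypothetical `loss` expression as above;
// `rho` controls how quickly the running average of squared gradients forgets old
// gradients (m <- rho * m + (1 - rho) * g^2):
//
//     rmsprop opt{ loss, /*batch_size=*/32, /*learning_rate=*/1.0e-3, /*rho=*/0.9 };
//     opt.forward();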
template< typename Loss, typename T >
struct adadelta : enable_id< adadelta< Loss, T >, "adadelta optimizer" >, enable_shared<adadelta<Loss, T>>
{
    typedef tensor<T> tensor_type;
    Loss& loss_;
    T rho_;
    T learning_rate_;
    unsigned long iterations_;

    adadelta( Loss& loss, std::size_t batch_size, T rho=0.9 ) noexcept :
        loss_{loss}, rho_{rho}
    {
        better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
        // ...
    }

    void forward()
    {
        loss_.backward( ones<T>( {1, } ) );
        auto& ss = get_default_session<tensor_type>();
        for ( auto [id, v] : ss.variables_ )
        {
            auto& data = v.data();
            auto& gradient = v.gradient();
            auto& contexts = v.contexts();
            if ( contexts.empty() ) // lazily allocate the two accumulators
            {
                contexts.push_back( zeros_like( gradient ) ); // running average of squared gradients
                contexts.push_back( zeros_like( gradient ) ); // running average of squared updates
            }
            auto& moments = contexts[0];
            auto& delta = contexts[1];
            // ...

            // accumulate squared gradients: m <- rho * m + (1 - rho) * g^2
            for_each( moments.begin(), moments.end(), gradient.begin(),
                      [this]( T& m, T g ) { m *= rho_; m += g*g*(1.0-rho_); } );

            // rescale the gradient: g <- g * learning_rate * sqrt((d + eps) / (m + eps))
            for_each( gradient.begin(), gradient.end(), delta.begin(), moments.begin(),
                      [this]( T& g, T d, T m ) { g *= learning_rate_ * std::sqrt((d+eps)/(m+eps)); } );
            // ...

            // accumulate squared rescaled gradients: d <- rho * d + (1 - rho) * g^2
            for_each( delta.begin(), delta.end(), gradient.begin(),
                      [this]( T& d, T g ) { d *= rho_; d += (1.0-rho_) * g * g; } );
        }
    }
};
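
// A minimal usage sketch, assuming the same hypothetical `loss` expression as above;
// note that the constructor shown here takes only `rho`: the per-parameter scale comes
// from the ratio sqrt((d + eps) / (m + eps)) of the two running averages:
//
//     adadelta opt{ loss, /*batch_size=*/32, /*rho=*/0.95 };
//     opt.forward();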
template< typename Loss, typename T >
struct adam : enable_id< adam< Loss, T >, "adam optimizer" >, enable_shared<adam<Loss, T>>
{
    typedef tensor<T> tensor_type;
    Loss& loss_;
    T learning_rate_;
    T beta_1_;
    T beta_2_;
    bool amsgrad_;
    unsigned long iterations_;

    adam( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-1, T beta_1=0.9, T beta_2=0.999, bool amsgrad=false ) noexcept :
        loss_{loss}, learning_rate_{learning_rate}, beta_1_{beta_1}, beta_2_{beta_2}, amsgrad_{amsgrad}
    {
        better_assert( batch_size >= 1, "batch_size must be positive, but got: ", batch_size );
        // ...
    }

    void forward()
    {
        loss_.backward( ones<T>( {1, } ) );
        auto& ss = get_default_session<tensor_type>();
        for ( auto [id, var] : ss.variables_ )
        {
            auto& data = var.data();
            auto& gradient = var.gradient();
            auto& contexts = var.contexts();
            if ( contexts.empty() ) // lazily allocate the first- and second-moment buffers
            {
                contexts.push_back( zeros_like( gradient ) );
                contexts.push_back( zeros_like( gradient ) );
            }
            auto& m = contexts[0];
            auto& v = contexts[1];
            // ... (the coefficients b_beta_1, b_beta_2 and the effective step size lr are prepared here)

            // first moment:  m <- b_beta_1 * m + (1 - b_beta_1) * g
            for_each( m.begin(), m.end(), gradient.begin(),
                      [b_beta_1]( T& m_, T g_ ){ m_ *= b_beta_1; m_ += g_*(1.0-b_beta_1); } );
            // second moment: v <- b_beta_2 * v + (1 - b_beta_2) * g^2
            for_each( v.begin(), v.end(), gradient.begin(),
                      [b_beta_2]( T& v_, T g_ ){ v_ *= b_beta_2; v_ += g_* g_*(1.0-b_beta_2); } );
            // ...

            // parameter update: d <- d - lr * m / (eps + sqrt(v))
            for_each( data.begin(), data.end(), m.begin(), v.begin(),
                      [lr]( T& d_, T m_, T v_ ){ d_ -= lr * m_ / (eps+std::sqrt(v_)); } );
            // ...

            // plain gradient step: d <- d - learning_rate * g
            for_each( data.begin(), data.end(), gradient.begin(),
                      [this]( T& d_, T g_ ){ d_ -= learning_rate_ * g_; } );
        }
    }
};
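
// A minimal usage sketch, assuming the same hypothetical `loss` expression as above;
// beta_1 and beta_2 are the decay rates of the first- and second-moment estimates:
//
//     adam opt{ loss, /*batch_size=*/32, /*learning_rate=*/1.0e-3,
//               /*beta_1=*/0.9, /*beta_2=*/0.999, /*amsgrad=*/false };
//     for ( unsigned long step = 0; step != 1000; ++step )
//         opt.forward();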
template< typename Loss, typename T >
struct gradient_descent : enable_id< gradient_descent< Loss, T >, "gradient_descent optimizer" >, enable_shared<gradient_descent<Loss, T>>
{
    typedef tensor<T> tensor_type;
    Loss& loss_;
    T learning_rate_;
    T momentum_;

    gradient_descent( Loss& loss, std::size_t batch_size, T learning_rate=1.0e-3, T momentum=0.0 ) noexcept :
        loss_{loss}, learning_rate_{learning_rate}, momentum_{momentum}
    {
        // ...
    }

    void forward()
    {
        loss_.backward( ones<T>( {1, } ) );
        auto& ss = get_default_session<tensor_type>();
        for ( auto& [id, v] : ss.variables_ )
        {
            // ...
            auto& gradient = v.gradient();
            better_assert( !has_nan(gradient), "gradient_descent error, tensor with id ", id, " has a nan value." );
            // ...
        }
    }
};
inline auto Adam = []( auto ... args )
{
    // ... (returns a callable that, applied to a loss expression, constructs the optimizer)
    return adam{loss, args...};
};

inline auto SGD = []( auto ... args )
{
    // ...
    return sgd{loss, args...};
};
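
// A usage sketch for the factory objects, assuming they bind the optimizer arguments
// first and accept the loss expression in a second call (the elided lines above
// suggest, but do not show, this currying); `loss` is the same hypothetical expression:
//
//     auto make_optimizer = Adam( /*batch_size=*/32, /*learning_rate=*/1.0e-3 );
//     auto opt = make_optimizer( loss );
//     opt.forward();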