ceras
yet another deep learning engine
operation.hpp
1 #ifndef IPKVWSJOCMGGVRASCBLPYHFBCHRIVEXYBOMMDAKFAUDFYVYOOOISLRXJNUJKPJEVMLDPRDSNM
2 #define IPKVWSJOCMGGVRASCBLPYHFBCHRIVEXYBOMMDAKFAUDFYVYOOOISLRXJNUJKPJEVMLDPRDSNM
3 
4 #include "./includes.hpp"
5 #include "./place_holder.hpp"
6 #include "./variable.hpp"
7 #include "./constant.hpp"
8 #include "./value.hpp"
9 #include "./utils/range.hpp"
10 #include "./utils/debug.hpp"
11 #include "./config.hpp"
12 #include "./utils/context_cast.hpp"
13 #include "./utils/for_each.hpp"
14 #include "./utils/id.hpp"
15 #include "./utils/enable_shared.hpp"
16 
17 namespace ceras
18 {
19  template< typename Operator, typename Forward_Action, typename Backward_Action >
20  struct unary_operator : enable_id<unary_operator<Operator, Forward_Action, Backward_Action>, "Unary Operator">
21  {
22  Operator op_;
23  Forward_Action forward_action_;
24  Backward_Action backward_action_;
25 
26  typedef decltype( std::declval<Forward_Action>()( std::declval<decltype(op_)>().forward() ) ) tensor_type;
27 
28  tensor_type input_data_;  // cached output of the wrapped operator, reused by backward()
29  tensor_type output_data_; // cached result of forward_action_, reused by backward()
30 
31  unary_operator( Operator const& op, Forward_Action const& forward_action, Backward_Action const& backward_action ) noexcept :
32  op_{op}, forward_action_{ forward_action }, backward_action_{ backward_action } { }
33 
34  auto forward()// const
35  {
36  input_data_ = op_.forward();
37  output_data_ = forward_action_( input_data_ ); // apply this node's forward action to the cached input
38  return output_data_;
39  }
40 
41  void backward( tensor_type const& grad )
42  {
43  auto const& current_gradient = backward_action_( input_data_, output_data_, grad );
44  op_.backward( current_gradient );
45  }
46 
47  };
48 
49  static auto constexpr make_unary_operator = []( auto const& unary_forward_action, auto const& unary_backward_action, std::string const& name="Anonymous Unary Operator" ) noexcept
50  {
51  return [&unary_forward_action, &unary_backward_action, &name]( auto const& op ) noexcept
52  {
53  auto ans = unary_operator{ op, unary_forward_action, unary_backward_action };
54  ans.name_ = name;
55  return ans;
56  };
57  };
58 
59  template< typename Lhs_Operator, typename Rhs_Operator, typename Forward_Action, typename Backward_Action >
60  struct binary_operator :enable_id<binary_operator<Lhs_Operator, Rhs_Operator, Forward_Action, Backward_Action>, "Binary Operator">
61  {
62  Lhs_Operator lhs_op_;
63  Rhs_Operator rhs_op_;
64  Forward_Action forward_action_;
65  Backward_Action backward_action_; // backward action for binary operator produces a tuple of two tensors
66 
67  typedef decltype( std::declval<Forward_Action>()( std::declval<decltype(lhs_op_)>().forward(), std::declval<decltype(rhs_op_)>().forward() ) ) tensor_type;
68 
69  tensor_type lhs_input_data_;  // cached forward results of the two child operators,
70  tensor_type rhs_input_data_;  // reused by backward()
71  tensor_type output_data_;     // cached result of forward_action_
72 
73  binary_operator( Lhs_Operator const& lhs_op, Rhs_Operator const& rhs_op, Forward_Action const& forward_action, Backward_Action const& backward_action ) noexcept :
74  lhs_op_{lhs_op}, rhs_op_{rhs_op}, forward_action_{ forward_action }, backward_action_{ backward_action } { }
75 
76  auto forward()
77  {
78  static_assert( !(is_value_v<Lhs_Operator> && is_value_v<Rhs_Operator>), "Not valid for two values" );
79 
80  if constexpr ( is_value_v<Lhs_Operator> )
81  {
82  rhs_input_data_ = rhs_op_.forward();
84  }
85  else if constexpr ( is_value_v<Rhs_Operator> )
86  {
87  lhs_input_data_ = lhs_op_.forward();
89  }
90  else
91  {
92  lhs_input_data_ = lhs_op_.forward();
93  rhs_input_data_ = rhs_op_.forward();
94  }
95  output_data_ = forward_action_( lhs_input_data_, rhs_input_data_ ); // compute and cache this node's output
96  return output_data_;
97  }
98 
99  void backward( tensor_type const& grad )
100  {
101  auto const& [current_gradient_lhs, current_gradient_rhs] = backward_action_( lhs_input_data_, rhs_input_data_, output_data_, grad );
102  lhs_op_.backward( current_gradient_lhs );
103  rhs_op_.backward( current_gradient_rhs );
104  }
105 
106  };
107 
108  static auto constexpr make_binary_operator = []( auto const& binary_forward_action, auto const& binary_backward_action, std::string const& name="Anonymous Binary Operator" ) noexcept
109  {
110  return [&binary_forward_action, &binary_backward_action, &name]( auto const& lhs_op, auto const& rhs_op ) noexcept
111  {
112  auto ans = binary_operator{ lhs_op, rhs_op, binary_forward_action, binary_backward_action };
113  ans.name_ = name;
114  return ans;
115  };
116  };
117 
118  template< typename T >
119  struct is_unary_operator : std::false_type{};
120 
121  template< typename Operator, typename Forward_Action, typename Backward_Action >
122  struct is_unary_operator< unary_operator<Operator, Forward_Action, Backward_Action> > : std::true_type {};
123 
127  template< class T >
128  inline constexpr bool is_unary_operator_v = is_unary_operator<T>::value;
129 
134  template< typename T >
135  concept Unary_Operator = is_unary_operator_v<T>;
136 
137 
138  template< typename T >
139  struct is_binary_operator : std::false_type{};
140 
141  template< typename Lhs_Operator, typename Rhs_Operator, typename Forward_Action, typename Backward_Action >
142  struct is_binary_operator< binary_operator<Lhs_Operator, Rhs_Operator, Forward_Action, Backward_Action> > : std::true_type {};
143 
147  template< class T >
148  inline constexpr bool is_binary_operator_v = is_binary_operator<T>::value;
149 
154  template< typename T >
155  concept Binary_Operator = is_binary_operator_v<T>;
156 
161  template< typename T >
162  concept Operator = Unary_Operator<T> || Binary_Operator<T>;
163 
168  template< typename T >
169  concept Expression = Operator<T> || Variable<T> || Place_Holder<T> || Constant<T> || Value<T>;
170 
171 
177  template< Expression Ex >
178  inline std::string computation_graph( Ex const& ex ) noexcept
179  {
180  auto generate_node_and_label = []<Expression Expr>( Expr const& expr ) noexcept
181  {
182  std::string const id = std::to_string( expr.id() );
183  std::string const name = expr.name();
184  std::string node = std::string{"n"} + id;
185  std::string label = name + std::string{"<"} + id + std::string{">"};
186  return std::make_tuple( node, label );
187  };
188 
189  auto generate_dot = [&generate_node_and_label]<Expression Expr>( Expr const& expr, auto const& _generate_dot ) noexcept
190  {
191  auto const& [node, label] = generate_node_and_label( expr );
192  std::string const& expr_dot = node + std::string{" [label=\""} + label + std::string{"\"] ;\n"};
193 
194  if constexpr( is_unary_operator_v<Expr> )
195  {
196  auto const& [n_node, n_label] = generate_node_and_label( expr.op_ );
197  std::string const& arrow_relation = n_node + std::string{" -> "} + node + std::string{" ;\n"};
198  std::string const& op_dot = _generate_dot( expr.op_, _generate_dot );
199  return expr_dot + arrow_relation + op_dot;
200  }
201  else if constexpr( is_binary_operator_v<Expr> )
202  {
203  // for LHS operator
204  auto const& [n_lhs_node, n_lhs_label] = generate_node_and_label( expr.lhs_op_ );
205  std::string const& arrow_lhs_relation = n_lhs_node + std::string{" -> "} + node + std::string{" ;\n"};
206  std::string const& op_lhs_dot = _generate_dot( expr.lhs_op_, _generate_dot );
207 
208  // for RHS operator
209  auto const& [n_rhs_node, n_rhs_label] = generate_node_and_label( expr.rhs_op_ );
210  std::string const& arrow_rhs_relation = n_rhs_node + std::string{" -> "} + node + std::string{" ;\n"};
211  std::string const& op_rhs_dot = _generate_dot( expr.rhs_op_, _generate_dot );
212 
213  return expr_dot + arrow_lhs_relation + arrow_rhs_relation + op_lhs_dot + op_rhs_dot;
214  }
215  else if constexpr ( is_variable_v<Expr> )
216  {
217  std::vector<unsigned long> const& shape = expr.shape();
218  bool const training_state = expr.trainable();
219 
220  // shape
221  std::stringstream ss;
222  std::copy( shape.begin(), shape.end(), std::ostream_iterator<unsigned long>( ss, " " ) );
223  std::string const& str_shape = ss.str() + (training_state ? std::string{"), trainable"} : std::string{"), non-trainable"});
224  // trainable state
225  std::string const& new_label = label + std::string{"[("} + str_shape + std::string{"]"};
226 
227  if (!training_state)
228  return node + std::string{" [shape=box,label=\""} + new_label + std::string{"\"] ;\n"};
229 
230  return node + std::string{" [peripheries=3,style=filled,color=\".7 .3 1.0\",shape=box,label=\""} + new_label + std::string{"\"] ;\n"};
231  }
232  else
233  {
234  return expr_dot;
235  }
236  };
237 
238  std::string const& head = "\n\ndigraph g {\n";
239  std::string const& tail = "}\n\n";
240  return head + generate_dot( ex, generate_dot ) + tail;
241  }
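
 // A small usage sketch for computation_graph (hedged example: the variable
 // front-end and `random` initializer come from the other ceras headers, and
 // the shapes shown here are illustrative, not prescribed by this file):
 //
 //   variable<tensor<float>> x{ random<float>( {4, 13} ) };
 //   variable<tensor<float>> W{ random<float>( {13, 1} ) };
 //   auto y = x * W;
 //   std::string dot = computation_graph( y ); // graphviz "digraph g { ... }" description of the expression tree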
242 
243 
244  namespace
245  {
246  // `plus_context` was originally nested inside the `plus` function,
247  // but gcc (11.1.0) struggles to deduce the nested context types (it may consume infinite memory and then get killed).
248  // The forward/backward algorithms are moved here to keep gcc happy, at the price of an extra indirection layer and some redundant code.
249  struct plus_context
250  {
251  auto make_forward() const noexcept
252  {
253  return []<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
254  {
255  better_assert( !has_nan( lhs_tensor ), "forward propagation for operator plus: lhs_tensor contains Nan!" );
256  better_assert( !has_nan( rhs_tensor ), "forward propagation for operator plus: rhs_tensor contains Nan!" );
257  return add( lhs_tensor, rhs_tensor );
258  };
259  }
260 
261  auto const make_backward() const noexcept
262  {
263  return []<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const& grad ) noexcept
264  {
265  better_assert( !has_nan( grad ), "backprop: upcoming gradient for operator + contains NaN!" );
266 
267  auto const& grad_fun = [&grad]( auto const& input )
268  {
269  Tsor ans = grad.deep_copy();
270  while( input.ndim() < ans.ndim() )
271  ans = sum( ans, 0 );
272  auto const& shape = input.shape();
273  for ( auto axis : range( input.ndim() ) )
274  if ( shape[axis] == 1 )
275  ans = sum( ans, axis, true );
276  return ans;
277  };
278  return std::make_tuple( grad_fun( lhs_input), grad_fun( rhs_input ) );
279  };
280  }
281  }; // plus_context
282  }//anonymous namespace
283 
284  template< Expression Lhs_Expression, Expression Rhs_Expression >
285  auto constexpr plus( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
286  {
287  return make_binary_operator( plus_context{}.make_forward(), plus_context{}.make_backward(), "Plus")( lhs_ex, rhs_ex );
288  }
289 
290  template< Expression Lhs_Expression, Expression Rhs_Expression >
291  auto constexpr operator + ( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
292  {
293  return plus( lhs_ex, rhs_ex );
294  }
295 
296  template< Expression Ex >
297  auto constexpr operator + ( Ex const& ex ) noexcept
298  {
299  return ex;
300  }
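
 // Usage sketch for plus / operator+ (hedged: `variable` and `random` come from
 // the other ceras headers; shapes are illustrative). The backward pass mirrors
 // plus_context above and folds the gradient back to each input's shape:
 //
 //   variable<tensor<float>> a{ random<float>( {2, 3} ) };
 //   variable<tensor<float>> b{ random<float>( {1, 3} ) };  // broadcast along the first axis
 //   auto c = a + b;              // builds a "Plus" binary_operator node
 //   auto v = c.forward();        // shape {2, 3}
 //   c.backward( ones_like(v) );  // b receives a gradient summed over the broadcast axis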
301 
302  namespace
303  {
304  struct multiplication_context
305  {
306  auto make_forward() const noexcept
307  {
308  return []( std::shared_ptr<std::any> forward_cache ) noexcept
309  {
310  return [forward_cache]<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
311  {
312  Tsor& ans = context_cast<Tsor>( forward_cache );
313  multiply( lhs_tensor, rhs_tensor, ans );
314  return ans;
315  };
316  };
317  }
318  auto make_backward() const noexcept
319  {
320  return []( std::shared_ptr<std::any> backward_cache_lhs, std::shared_ptr<std::any> backward_cache_rhs ) noexcept
321  {
322  return [backward_cache_lhs, backward_cache_rhs]<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const& grad ) noexcept
323  {
324  // left branch <-- grad * rhs^T
325  auto const& g_shape = grad.shape();
326  auto const[m, n] = std::make_tuple( g_shape[0], g_shape[1] ); // 4, 1
327  auto const k = *(lhs_input.shape().rbegin()); // 13
328 
329  Tsor& lhs_grad = context_cast<Tsor>( backward_cache_lhs );
330  lhs_grad.resize( lhs_input.shape() );
331 
332  gemm( grad.data(), false, rhs_input.data(), true, m, n, k, lhs_grad.data() );
333 
334  // right branch <-- lhs^T * grad
335  Tsor& rhs_grad = context_cast<Tsor>( backward_cache_rhs );
336  rhs_grad.resize( rhs_input.shape() );
337  gemm( lhs_input.data(), true, grad.data(), false, k, m, n, rhs_grad.data() );
338 
339  return std::make_tuple( lhs_grad, rhs_grad );
340  };
341  };
342  }
343  };//multiplication_context
344  }//anonymous namespace
345 
346  template< Expression Lhs_Expression, Expression Rhs_Expression >
347  auto operator * ( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
348  {
349  // case of Value * Operator and Operator * Value
350  if constexpr( is_value_v<Lhs_Expression> || is_value_v<Rhs_Expression> )
351  {
352  return elementwise_product( lhs_ex, rhs_ex );
353  }
354  else
355  {
356  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
357  std::shared_ptr<std::any> backward_cache_lhs = std::make_shared<std::any>();
358  std::shared_ptr<std::any> backward_cache_rhs = std::make_shared<std::any>();
359  return make_binary_operator( multiplication_context{}.make_forward()(forward_cache), multiplication_context{}.make_backward()(backward_cache_lhs, backward_cache_rhs), "Multiply")( lhs_ex, rhs_ex );
360  }
361  }
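
 // Matrix-product sketch for operator* between two non-value expressions
 // (shapes follow the gemm calls above; the names are illustrative):
 //
 //   variable<tensor<float>> x{ random<float>( {4, 13} ) };  // [m, k]
 //   variable<tensor<float>> w{ random<float>( {13, 1} ) };  // [k, n]
 //   auto y = x * w;                                         // "Multiply" node, forward shape [4, 1]
 //   auto v = y.forward();
 //   y.backward( ones_like(v) );  // lhs gradient = grad * w^T, rhs gradient = x^T * grad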
362 
363 #if 0
364  template <Expression Ex>
365  auto constexpr log( Ex const& ex ) noexcept
366  {
367  return make_unary_operator( []<Tensor Tsor>( Tsor const& input ) noexcept
368  {
369  better_assert( !has_nan( input ), "forward propagation for operator log: input contains Nan!" );
370  auto ans = input.deep_copy();
371  ans.map( [](auto & x){ better_assert( x+eps > 0, "log forward propagation, found an invalid value ", x ); x = std::log(x+eps); } );
372  better_assert( !has_nan( ans ), "forward propagation for operator log: output contains Nan!" );
373  better_assert( !has_inf( ans ), "forward propagation for operator log: output contains Inf!" );
374  return ans;
375  },
376  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
377  {
378  better_assert( !has_nan( grad ), "input gradient for operator log contains NaN!" );
379  auto ans = elementwise_divide(grad, input); // TODO: error here
380  better_assert( !has_nan( ans ), "backprop: result for operator log contains NaN!" );
381  return ans;
382  },
383  "Log"
384  )( ex );
385  };
386 #endif
387 
388  template <Expression Ex>
389  auto constexpr negative( Ex const& ex ) noexcept
390  {
391  return make_unary_operator( []<Tensor Tsor>( Tsor const& tensor ) noexcept
392  {
393  better_assert( !has_nan( tensor ), "forward propagation for operator log: tensor contains Nan!" );
394  return -tensor;
395  },
396  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
397  {
398  better_assert( !has_nan( grad ), "input gradient for operator negative contains NaN!" );
399  return -grad;
400  },
401  "Negative"
402  )( ex );
403  }
404 
405  template <Expression Ex>
406  auto constexpr operator - ( Ex const& ex ) noexcept
407  {
408  return negative( ex );
409  }
410 
411  template< Expression Lhs_Expression, Expression Rhs_Expression >
412  auto constexpr elementwise_product( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
413  {
414  return make_binary_operator( []<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
415  {
416  return elementwise_product( lhs_tensor, rhs_tensor );
417  },
418  []<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const grad ) noexcept
419  {
420  auto const& grad_fun = [&grad]( auto const& input, auto const& other_input )
421  {
422  Tsor ans = elementwise_product( grad, other_input );
423  while( input.ndim() < ans.ndim() )
424  ans = sum( ans, 0 );
425  auto const& shape = input.shape();
426  for ( auto axis : range( input.ndim() ) )
427  if ( shape[axis] == 1 )
428  ans = sum( ans, axis, true );
429  return ans;
430  };
431  return std::make_tuple( grad_fun( lhs_input, rhs_input ), grad_fun( rhs_input, lhs_input ) );
432  },
433  "HadamardProduct"
434  )( lhs_ex, rhs_ex );
435  }
436 
437  template< Expression Lhs_Expression, Expression Rhs_Expression >
438  auto constexpr elementwise_multiply( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
439  {
440  return elementwise_product( lhs_ex, rhs_ex );
441  }
442 
443  template< Expression Lhs_Expression, Expression Rhs_Expression >
444  auto constexpr hadamard_product( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
445  {
446  return elementwise_product( lhs_ex, rhs_ex );
447  }
448 
449  template <Expression Ex>
450  auto constexpr sum_reduce( Ex const& ex ) noexcept
451  {
452  return make_unary_operator( []<Tensor Tsor>( Tsor const& tsor ) noexcept
453  {
454  better_assert( !has_nan( tsor ), "forward propagation for operator sum_reduce: tensor contains Nan!" );
455  return reduce_sum( tsor );
456  },
457  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
458  {
459  better_assert( !has_nan( grad ), "input gradient for operator sum_reduce contains NaN!" );
460  better_assert( grad.size() == 1, "sum_reduce should only output one value" );
461  Tsor ans = ones_like( input );
462  ans *= grad[0];
463  return ans;
464  },
465  "Sum"
466  )( ex );
467  }
468 
469  template <Expression Ex>
470  auto constexpr reduce_sum( Ex const& ex ) noexcept
471  {
472  return sum_reduce( ex );
473  }
474 
487  template <Expression Ex>
488  auto constexpr mean_reduce( Ex const& ex ) noexcept
489  {
490  return make_unary_operator( []<Tensor Tsor>( Tsor const& tsor ) noexcept
491  {
492  better_assert( !has_nan( tsor ), "forward propagation for operator mean: tensor contains Nan!" );
493  return reduce_mean( tsor );
494  },
495  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
496  {
497  better_assert( !has_nan( grad ), "input gradient for operator mean_reduce contains NaN!" );
498  better_assert( grad.size() == 1, "mean_reduce should only output one value" );
499  Tsor ans = ones_like( input );
500  ans *= grad[0];
501  unsigned long const batch_size = (input.shape().size() == 1) ? 1 : (*(input.shape().begin()));
502  ans /= static_cast<typename Tsor::value_type>(batch_size);
503  return ans;
504  },
505  "Mean"
506  )( ex );
507  }
508 
512  template <Expression Ex>
513  auto constexpr reduce_mean( Ex const& ex ) noexcept
514  {
515  return mean_reduce( ex );
516  }
517 
521  template <Expression Ex>
522  auto constexpr mean( Ex const& ex ) noexcept
523  {
524  return mean_reduce( ex );
525  }
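
 // Reduction sketch: sum_reduce/reduce_sum collapse an expression to a single
 // value, and mean_reduce/reduce_mean/mean additionally divide the gradient by
 // the leading (batch) dimension, as implemented in the kernels above:
 //
 //   variable<tensor<float>> x{ random<float>( {8, 3} ) };
 //   auto loss = mean( square( x ) );
 //   auto v = loss.forward();        // a single-element tensor
 //   loss.backward( ones_like(v) );  // gradient into x is 2*x/8, per the square and mean kernels above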
526 
527 
528 
529 
530  template< Expression Lhs_Expression, Expression Rhs_Expression >
531  auto constexpr minus( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
532  {
533  if constexpr (is_value_v<Rhs_Expression>)
534  {
535  return negative( plus( negative(lhs_ex), rhs_ex ) );
536  }
537  else
538  {
539  return plus( lhs_ex, negative(rhs_ex) );
540  }
541  }
542 
543  template< Expression Lhs_Expression, Expression Rhs_Expression >
544  auto constexpr operator - ( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
545  {
546  return minus( lhs_ex, rhs_ex );
547  }
548 
549 
562  template <Expression Ex>
563  auto constexpr square( Ex const& ex ) noexcept
564  {
565  return make_unary_operator( []<Tensor Tsor>( Tsor const& tsor ) noexcept
566  {
567  better_assert( !has_nan( tsor ), "forward propagation for operator square: tensor contains Nan!" );
568  Tsor ans = tsor.deep_copy();
569  std::for_each( ans.data(), ans.data() + ans.size(), []( auto & v ){ v *= v; } );
570  return ans;
571  },
572  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
573  {
574  better_assert( !has_nan( grad ), "input gradient for operator square contains NaN!" );
575  Tsor ans = input.deep_copy();
576  ans *= grad;
577  ans *= typename Tsor::value_type{2};
578  return ans;
579  },
580  "Square"
581  )( ex );
582  }
583 
584 #if 0
597  template <Expression Ex>
598  auto constexpr sqrt( Ex const& ex ) noexcept
599  {
600  return make_unary_operator( []<Tensor Tsor>( Tsor const& tsor ) noexcept
601  {
602  Tsor ans = tsor.deep_copy(); // TODO: optimize out
603  std::for_each( ans.data(), ans.data() + ans.size(), []( auto & v ){ v = std::sqrt(v); } );
604  return ans;
605  },
606  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
607  {
608  Tsor ans = input.deep_copy(); // TODO: optimize out; the input values are needed below, since d/dx sqrt(x) = 1/(2*sqrt(x))
609  for_each( ans.begin(), ans.end(), grad.begin(), []( auto& v, auto g ){ v = 0.5 * g / (std::sqrt(v)+eps); } );
610  return ans;
611  },
612  "SquareRoot"
613  )( ex );
614  }
615 #endif
616 
631  template <Expression Ex, Expression Ey>
632  auto constexpr hypot( Ex const& ex, Ey const& ey ) noexcept
633  {
634  return sqrt( square(ex) + square(ey) );
635  }
636 
637 
638 
639 
640 
641 #if 0
654  template <Expression Ex>
655  auto constexpr abs( Ex const& ex ) noexcept
656  {
657  return make_unary_operator( []<Tensor Tsor>( Tsor const& tsor ) noexcept
658  {
659  better_assert( !has_nan( tsor ), "forward propagation for operator abs: tensor contains Nan!" );
660  Tsor ans = tsor.deep_copy();
661  std::for_each( ans.data(), ans.data() + ans.size(), []( typename Tsor::value_type & v ){ v = std::abs(v); } );
662  return ans;
663  },
664  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
665  {
666  better_assert( !has_nan( grad ), "input gradient for operator abs contains NaN!" );
667  Tsor ans = grad;
668  for ( auto idx : range( ans.size() ) )
669  ans[idx] = (input[idx]>typename Tsor::value_type{0}) ? ans[idx] : -ans[idx];
670  return ans;
671  },
672  "Abs"
673  )( ex );
674  }//;
675 #endif
676 
677 #if 0
678  template <Expression Ex>
679  [[deprecated("GCC might die here. Use exponential instead.")]]
680  auto constexpr exp( Ex const& ex ) noexcept
681  {
682  return make_unary_operator( []<Tensor Tsor>( Tsor const& tsor ) noexcept
683  {
684  better_assert( !has_nan( tsor ), "forward propagation for operator exp: tensor contains Nan!" );
685  Tsor ans = tsor.deep_copy();
686  std::for_each( ans.data(), ans.data() + ans.size(), []( auto & v ){ v = std::exp(v); } );
687  return ans;
688  },
689  []<Tensor Tsor>( Tsor const&, Tsor const& output, Tsor const& grad ) noexcept
690  {
691  better_assert( !has_nan( grad ), "input gradient for operator exp contains NaN!" );
692  Tsor ans = grad.deep_copy();
693  ans *= output; // d/dx exp(x) = exp(x), i.e. the cached forward output
694  return ans;
695  },
696  "Exp"
697  )( ex );
698  }
699 #endif
700 
701  template <typename Float> requires std::floating_point<Float>
702  auto constexpr clip( Float lower, Float upper=std::numeric_limits<Float>::max() ) noexcept
703  {
704  return [lower, upper]<Expression Ex>( Ex const& ex ) noexcept
705  {
706  return make_unary_operator( [lower, upper]<Tensor Tsor>( Tsor const& tsor ) noexcept
707  {
708  better_assert( !has_nan( tsor ), "forward propagation for operator clip: tensor contains Nan!" );
709  Tsor ans = tsor.deep_copy();
710  clip( ans, lower, upper );
711  return ans;
712  },
713  [lower, upper]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
714  {
715  better_assert( !has_nan( grad ), "input gradient for operator clip contains NaN!" );
716  const typename Tsor::value_type zero{0};
717  Tsor ans = grad;
718  for ( auto idx : range( input.size() ) )
719  ans[idx] = (input[idx] < lower) ? zero :
720  (input[idx] > upper) ? zero :
721  ans[idx];
722  return ans;
723  },
724  "Clip"
725  )( ex );
726  };
727  }
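
 // Clip usage sketch (curried form: clip(lower, upper) returns a callable that
 // is then applied to an expression; gradients of clipped entries are zeroed):
 //
 //   variable<tensor<float>> x{ random<float>( {4, 4} ) };
 //   auto y = clip( 0.1f, 0.9f )( x );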
728 
729  // include_batch_flag:
730  //
731  // true: treat the first dimension of the incoming expression as the batch size
732  // - for a new_shape of (1, 3, 4), expecting an incoming expression with 12 elements per sample, e.g. of shape [BS, 12] or [BS, 12, 1, 1]
733  // - expected output of shape [BS, 1, 3, 4]
734  // false: do not consider the batch size
735  // - for a new_shape of (1, 3, 4), expecting an incoming expression with 12 elements in total, e.g. of shape [12, 1]
736  // - expected output of shape [1, 3, 4]  (a usage sketch follows this function)
737  auto inline reshape( std::vector<unsigned long> const& new_shape, bool include_batch_flag=true ) noexcept
738  {
739  return [new_shape, include_batch_flag]<Expression Ex>( Ex const& ex ) noexcept
740  {
741  return make_unary_operator
742  (
743  [new_shape, include_batch_flag]<Tensor Tsor>( Tsor const& tsor ) noexcept
744  {
745  unsigned long const new_size = std::accumulate( new_shape.begin(), new_shape.end(), 1UL, []( auto x, auto y ){ return x*y; } );
746  unsigned long const total_size = tsor.size();
747  unsigned long const batch_size = total_size / new_size;
748 
749  better_assert( batch_size * new_size == total_size, "size mismatch for reshape operator, expect ", batch_size*new_size, " but total input size is ", total_size, ", where batch_size is ", batch_size );
750 
751  if ( !include_batch_flag )
752  {
753  better_assert( batch_size == 1, "expecting batch size of 1 while not including batch, but got ", batch_size );
754  Tsor ans{tsor};
755  ans.reshape( new_shape );
756  return ans;
757  }
758 
759  std::vector<unsigned long> batched_new_shape;
760  {
761  batched_new_shape.resize( 1 + new_shape.size() );
762  batched_new_shape[0] = batch_size;
763  std::copy( new_shape.begin(), new_shape.end(), batched_new_shape.begin()+1 );
764  }
765 
766  Tsor ans{ tsor };
767  ans.reshape( batched_new_shape );
768  return ans;
769  },
770  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
771  {
772  Tsor ans{ grad };
773  ans.reshape( input.shape() );
774  return ans;
775  },
776  "Reshape"
777  )( ex );
778  };
779  }
780 
781  template <Expression Ex>
782  auto constexpr flatten( Ex const& ex ) noexcept
783  {
784  return make_unary_operator
785  (
786  []<Tensor Tsor>( Tsor const& tsor ) noexcept
787  {
788  better_assert( tsor.ndim() > 1, "Expecting dimension of incoming tensor to be greater than 1, but got ", tsor.ndim() );
789  unsigned long const batch_size = *(tsor.shape().begin());
790  unsigned long const rem = tsor.size() / batch_size;
791  Tsor ans = tsor;
792  return ans.reshape( {batch_size, rem} );
793  },
794  []<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
795  {
796  Tsor ans = grad;
797  return ans.reshape( input.shape() );
798  },
799  "Flatten"
800  )( ex );
801  }
802 
803  template <Expression Ex>
804  auto constexpr identity( Ex const& ex ) noexcept
805  {
806  return make_unary_operator
807  (
808  []<Tensor Tsor>( Tsor const& tsor ) noexcept
809  {
810  return tsor;
811  },
812  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
813  {
814  return grad;
815  },
816  "Identity"
817  )( ex );
818  }
819 
820  template< Expression Ex >
821  auto transpose( Ex const& ex ) noexcept
822  {
823  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
824  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
825  return make_unary_operator
826  (
827  [forward_cache]<Tensor Tsor>( Tsor const& tsor ) noexcept
828  {
829  better_assert( tsor.ndim() == 2, "Expecting 2D tensor, but got dimensions ", tsor.ndim() );
830 
831  typedef typename Tsor::value_type value_type;
832 
833  std::vector<unsigned long> const shape = tsor.shape();
834  auto const[row, col] = std::make_tuple( shape[0], shape[1] );
835  view_2d<value_type> v_in{ tsor.data(), row, col };
836 
837  Tsor& ans = context_cast<Tsor>( forward_cache );
838  ans.resize( {col, row} );
839  view_2d<value_type> v_out{ ans.data(), col, row };
840 
841  for ( auto r : range( row ) )
842  for ( auto c : range( col ) )
843  v_out[c][r] = v_in[r][c];
844 
845  return ans;
846  },
847  [backward_cache]<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
848  {
849  typedef typename Tsor::value_type value_type;
850 
851  std::vector<unsigned long> const shape = grad.shape();
852  auto const[row, col] = std::make_tuple( shape[0], shape[1] );
853  view_2d<value_type> v_in{ grad.data(), row, col };
854 
855  Tsor& back_ans = context_cast<Tsor>( backward_cache );
856  back_ans.resize( {col, row} );
857 
858  view_2d<value_type> v_out{ back_ans.data(), col, row };
859 
860  for ( auto r : range( row ) )
861  for ( auto c : range( col ) )
862  v_out[c][r] = v_in[r][c];
863 
864  return back_ans;
865  },
866  "Transpose"
867  )( ex );
868  }
869 
870  auto inline img2col( unsigned long const row_kernel, unsigned long col_kernel=-1,
871  unsigned long const row_padding=0, unsigned long col_padding=0,
872  unsigned long const row_stride=1, unsigned long const col_stride=1,
873  unsigned long const row_dilation=1, unsigned long const col_dilation=1 ) noexcept
874  {
875  if ( col_kernel == (unsigned long)-1 ) col_kernel = row_kernel;
876 
877  std::shared_ptr<std::vector<std::uint32_t>> s_index_record = std::make_shared<std::vector<std::uint32_t>>(); // col_img[idx] = img[index_record[idx]] -- (-1) for zero padding
878 
879  auto img2col_forward = [s_index_record]<Tensor Tsor>
880  (
881  Tsor const& input_img, Tsor& output_col_mat,
882  unsigned long kernel_row, unsigned long kernel_col,
883  unsigned long padding_row, unsigned long padding_col,
884  unsigned long stride_row, unsigned long stride_col,
885  unsigned long dilation_row, unsigned long dilation_col
886  ) noexcept
887  {
888  typedef typename Tsor::value_type value_type;
889  std::vector<std::uint32_t>& index_record = *s_index_record; //32 bit should be enough for memory address offsets
890 
891  std::vector<unsigned long> input_shape = input_img.shape();
892  better_assert( input_shape.size() == 4, "Expecting a 4D tensor." );
893  auto const [BS, R, C, CH] = std::make_tuple( input_shape[0], input_shape[1], input_shape[2], input_shape[3] );
894 
895  unsigned long const output_row = ( R + 2 * padding_row - ( dilation_row * (kernel_row - 1) + 1 ) ) / stride_row + 1;
896  unsigned long const output_col = ( C + 2 * padding_col - ( dilation_col * (kernel_col - 1) + 1 ) ) / stride_col + 1;
897  unsigned long const output_column_matrix_row = kernel_row * kernel_col * CH;
898  unsigned long const output_column_matrix_col = BS * output_row * output_col;
899 
900  output_col_mat.resize( {output_column_matrix_row, output_column_matrix_col} );
901 
902  if ( index_record.size() != output_column_matrix_row * output_column_matrix_col ) // first-run?
903  {
904  index_record.resize( output_column_matrix_row * output_column_matrix_col );
905 
906  for ( auto bs : range( BS ) )
907  {
908  std::int64_t const col_offset = bs * output_row * output_col * kernel_row * kernel_col * CH;
909  std::int64_t const im_offset = bs * R * C * CH;
910  for ( auto c : range( CH * kernel_row * kernel_col ) )
911  {
912  std::int64_t const w_offset = c % kernel_col;
913  std::int64_t const h_offset = ( c / kernel_col ) % kernel_row;
914  std::int64_t const c_im = c / ( kernel_col * kernel_row );
915 
916  for ( auto h : range( output_row ) )
917  {
918  std::int64_t const im_row_idx = h * stride_row - padding_row + h_offset * dilation_row;
919  for ( auto w : range( output_col ) )
920  {
921  std::int64_t const im_col_idx = w * stride_col - padding_col + w_offset * dilation_col;
922  std::int64_t const im_idx = im_offset+( im_row_idx * C + im_col_idx ) * CH + c_im;
923  std::int64_t const col_idx = col_offset+( c * output_row + h ) * output_col + w;
924  index_record[col_idx] = static_cast<std::uint32_t>((im_row_idx<0 || im_row_idx>=static_cast<std::int64_t>(R) || im_col_idx<0 || im_col_idx>=static_cast<std::int64_t>(C)) ? 0xffffffff : im_idx);
925  }
926  }
927  }
928  }
929  // re-arrange [bs, new_R, new_C] --> [new_R, new_c*bs]
930  {
931  std::vector<std::uint32_t> re_arranged_index;
932  re_arranged_index.resize( index_record.size() );
933 
934  view_3d<std::uint32_t> re_arranged_mat{ re_arranged_index.data(), output_column_matrix_row, BS, output_row*output_col };
935  view_3d<std::uint32_t> index_record_mat{ index_record.data(), BS, output_column_matrix_row, output_row*output_col };
936 
937  for ( auto bs : range( BS ) )
938  for ( auto r : range( output_column_matrix_row ) )
939  for ( auto c : range( output_row*output_col ) )
940  re_arranged_mat[r][bs][c] = index_record_mat[bs][r][c];
941  // overwrite index record
942  std::copy( re_arranged_index.begin(), re_arranged_index.end(), index_record.begin() );
943  }
944  }
945 
946  // fill-in
947  for ( auto idx : range( output_col_mat.size() ) )
948  {
949  auto const index = index_record[idx];
950  output_col_mat[idx] = (index == 0xffffffff) ? value_type{0} : input_img[index];
951  }
952  };
953 
954  auto img2col_backward = [s_index_record]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad, Tsor& ans ) noexcept
955  {
956  typedef typename Tsor::value_type value_type;
957  ans.resize( input.shape() );
958  std::fill( ans.begin(), ans.end(), value_type{0} );
959 
960  std::vector<std::uint32_t>& index_record = *s_index_record; //32 bit should be enough for memory address offsets
961  for ( auto idx : range( grad.size() ) )
962  {
963  auto const index = index_record[idx];
964  if ( index != 0xffffffff )
965  ans[index] += grad[idx];
966  }
967  };
968 
969  std::shared_ptr<std::any> output_cache = std::make_shared<std::any>();
970  std::shared_ptr<std::any> back_grad_cache = std::make_shared<std::any>();
971 
972  return [row_kernel, col_kernel, row_padding, col_padding, row_stride, col_stride, row_dilation, col_dilation, img2col_forward, img2col_backward, output_cache, back_grad_cache]<Expression Ex>( Ex const& ex ) noexcept
973  {
974  return make_unary_operator
975  (
976  [=]<Tensor Tsor>( Tsor const & tsor ) noexcept
977  {
978  Tsor& output = context_cast<Tsor>( output_cache );
979  img2col_forward( tsor, output, row_kernel, col_kernel, row_padding, col_padding, row_stride, col_stride, row_dilation, col_dilation );
980  return Tsor{output};
981  },
982  [=]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
983  {
984  Tsor& back_grad = context_cast<Tsor>( back_grad_cache );
985  img2col_backward( input, output, grad, back_grad );
986  return Tsor{back_grad};
987  },
988  "Img2Col"
989  )( ex );
990  };
991  }
992 
993  auto inline conv2d
994  (
995  unsigned long row_input, unsigned long col_input,
996  unsigned long const row_stride=1, unsigned long const col_stride=1,
997  unsigned long const row_dilation=1, unsigned long const col_dilation=1,
998  std::string const& padding="valid"
999  ) noexcept
1000  {
1001  // lhs_ex is for one 4D tensor of [BS, R, C, CH]
1002  // rhs_ex is for NC 4D filter of [1, r, c, CH], thus the shape is [NC, r, c, CH]
1003  // the output tensor is of shape [BS, .., .., NC]
1004  //
1005  // Note: the rhs expression is fixed as a variable, as we need to extract the kernel shape from it
1006  //
1007  //return [row_input, col_input, row_stride, col_stride, row_dilation, col_dilation, padding ]<Expression Ex, Variable Va>( Ex const& lhs_ex, Va const& rhs_ex ) noexcept
1008  return [row_input, col_input, row_stride, col_stride, row_dilation, col_dilation, padding ]<Expression Ex, Expression Ey>( Ex const& lhs_ex, Ey const& rhs_ex ) noexcept
1009  {
1010  std::vector<unsigned long> const& shape = rhs_ex.shape();
1011  better_assert( shape.size() == 4 );
1012  auto const[new_channel, row_kernel, col_kernel, channel] = std::make_tuple( shape[0], shape[1], shape[2], shape[3] );
1013  //TODO: optimization in case of small kernels of (1, 1), (3, 3)
1014  unsigned long row_padding = 0;
1015  unsigned long col_padding = 0;
1016  if ( padding == "same" )
1017  {
1018  unsigned long const row_padding_total = (row_kernel + (row_kernel - 1) * (row_dilation - 1) - row_stride);
1019  better_assert( !(row_padding_total & 0x1), "Expecting total row padding to be even, but got ", row_padding_total, " With row input ", row_input, " and row_stride ", row_stride );
1020  unsigned long const col_padding_total = (col_kernel + (col_kernel - 1) * (col_dilation - 1) - col_stride);
1021  better_assert( !(col_padding_total & 0x1), "Expecting total col padding to be even, but got ", col_padding_total );
1022  row_padding = ((row_kernel&1)+row_padding_total) >> 1;
1023  col_padding = ((col_kernel&1)+col_padding_total) >> 1;
1024  }
1025 
1026  unsigned long const row_output = ( row_input + 2 * row_padding - ( row_dilation * (row_kernel - 1) + 1 ) ) / row_stride + 1;
1027  unsigned long const col_output = ( col_input + 2 * col_padding - ( col_dilation * (col_kernel - 1) + 1 ) ) / col_stride + 1;
1028 
1029  auto lhs_ex_as_col = img2col(row_kernel, col_kernel, row_padding, col_padding, row_stride, col_stride, row_dilation, col_dilation)( lhs_ex ); // [BS, R, C, CH] ==> [r*c*CH, BS*new_row*new_col]
1030 
1031  auto rhs_ex_flatten = reshape({row_kernel*col_kernel*channel,})( rhs_ex ); // [NC, r, c, CH] ==> [NC, r*c*CH]
1032 
1033  auto flatten_output = rhs_ex_flatten * lhs_ex_as_col; // [NC, BS * new_row * new_col]
1034 
1035  auto tr_output = transpose( flatten_output ); // [BS*new_row*new_col, NC]
1036 
1037  auto ans = reshape({row_output, col_output, new_channel})( tr_output );
1038 
1039  return ans;
1040  };
1041  }
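
 // Conv2D usage sketch following the shape conventions in the comments above
 // (input [BS, R, C, CH], kernel variable [NC, r, c, CH]; values illustrative):
 //
 //   variable<tensor<float>> img{ random<float>( {16, 28, 28, 1} ) };  // BS=16, 28x28, 1 channel
 //   variable<tensor<float>> ker{ random<float>( {32, 3, 3, 1} ) };    // 32 filters of 3x3
 //   auto feat = conv2d( 28, 28 )( img, ker );                         // "valid" padding -> [16, 26, 26, 32]
 //   auto same = conv2d( 28, 28, 1, 1, 1, 1, "same" )( img, ker );     // -> [16, 28, 28, 32]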
1042 
1043  template< typename T > requires std::floating_point<T>
1044  inline auto drop_out( T const factor ) noexcept
1045  {
1046  better_assert( factor < T{1}, "Expecting drop out rate less than 1, but got factor = ", factor );
1047  better_assert( factor > T{0}, "Expecting drop out rate greater than 0, but got factor = ", factor );
1048 
1049  std::shared_ptr<std::any> mask = std::make_shared<std::any>();
1050  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1051  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1052 
1053  return [factor, mask, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
1054  {
1055  return make_unary_operator
1056  (
1057  [factor, mask, forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
1058  {
1059  typedef typename Tsor::value_type value_type;
1060 
1061  if ( learning_phase == 0 ) // defined in 'config.hpp'
1062  return input;
1063 
1064  std::any& mask_ = *mask;
1065  // first run, initialize mask
1066  if ( !mask_.has_value() )
1067  {
1068  Tsor const random_tensor = random<value_type>( input.shape() );
1069  Tsor mask__{ input.shape() };
1070  for ( auto idx : range( input.size() ) )
1071  if ( random_tensor[ idx ] > factor )
1072  mask__[ idx ] = 1;
1073  mask_ = mask__; // initialize
1074  }
1075 
1076  Tsor& mask__ = std::any_cast<Tsor&>( mask_ );
1077 
1078  Tsor& ans = context_cast<Tsor>( forward_cache );
1079  ans.deep_copy( input );
1080 
1081  for ( auto idx : range( input.size() ) )
1082  ans[idx] *= mask__[idx] / (value_type{1} - factor);
1083  return ans;
1084  },
1085  [mask, backward_cache]<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
1086  {
1087  if ( learning_phase == 0 ) // defined in 'config.hpp'
1088  return grad;
1089 
1090  Tsor& mask__ = std::any_cast<Tsor&>( *mask );
1091 
1092  Tsor& ans = context_cast<Tsor>( backward_cache );
1093  ans.deep_copy( grad );
1094 
1095  for ( auto idx : range( grad.size() ) )
1096  ans[idx] *= mask__[idx];
1097  return ans;
1098  },
1099  "Dropout"
1100  )( ex );
1101  };
1102  }
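
 // Dropout usage sketch: when learning_phase is non-zero roughly `factor` of the
 // activations are zeroed and the survivors are rescaled by 1/(1-factor); during
 // inference (learning_phase == 0) the input passes through unchanged:
 //
 //   variable<tensor<float>> x{ random<float>( {32, 128} ) };
 //   auto y = drop_out( 0.5 )( x );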
1103 
1104 
1105  namespace
1106  {
1107 
1108  struct max_pooling_2d_context
1109  {
1110 
1111  auto make_forward() const noexcept
1112  {
1113  return []( unsigned long stride, std::shared_ptr<std::any> mask, std::shared_ptr<std::any> forward_cache ) noexcept
1114  {
1115  return [=]<Tensor Tsor>( Tsor const& input ) noexcept
1116  {
1117  typedef typename Tsor::value_type value_type;
1118  better_assert( input.ndim() == 4, "Expecting a 4D tensor, but got ", input.ndim() );
1119 
1120  Tsor& mask__ = context_cast<Tsor>( mask );
1121  mask__.resize( input.shape() );
1122 
1123 
1124  std::vector<unsigned long> shape = input.shape();
1125  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1126  Tsor input_ = input;
1127  view_4d<value_type> ts{ input_.data(), batch_size, row, col, channel };
1128  view_4d<value_type> tm{ mask__.data(), batch_size, row, col, channel };
1129 
1130  Tsor& ans = context_cast<Tsor>( forward_cache );
1131  ans.resize( {batch_size, row/stride, col/stride, channel} );
1132 
1133  view_4d<value_type> t1{ ans.data(), batch_size, row/stride, col/stride, channel };
1134 
1135  for ( auto bs : range(batch_size) )
1136  for ( auto r : range(row/stride) ) // row for t1
1137  for ( auto c : range(col/stride) ) // col for t1
1138  for ( auto ch : range(channel) )
1139  {
1140  unsigned long current_row_max = r * stride;
1141  unsigned long current_col_max = c * stride;
1142  for ( auto _r : range( (r*stride), ((r*stride)+stride) ) ) // row for ts
1143  for ( auto _c : range( (c*stride), ((c*stride)+stride) ) ) // col for ts
1144  {
1145  if ( ts[bs][_r][_c][ch] > ts[bs][current_row_max][current_col_max][ch] )
1146  {
1147  current_row_max = _r;
1148  current_col_max = _c;
1149  }
1150  }
1151  tm[bs][current_row_max][current_col_max][ch] = 1.0; //mark as max
1152  t1[bs][r][c][ch] = ts[bs][current_row_max][current_col_max][ch]; // update value
1153  }
1154  return ans;
1155  };
1156  };
1157  }
1158 
1159  auto make_backward() const noexcept
1160  {
1161  return []( unsigned long stride, std::shared_ptr<std::any> mask, std::shared_ptr<std::any> backward_cache ) noexcept
1162  {
1163  return [=]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
1164  {
1165  typedef typename Tsor::value_type value_type;
1166  std::vector<unsigned long> const& shape = input.shape();
1167  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1168 
1169  Tsor& mask__ = std::any_cast<Tsor&>( *mask );
1170  view_4d<value_type> tm{ mask__.data(), batch_size, row, col, channel };
1171 
1172  Tsor& ans = context_cast<Tsor>( backward_cache );
1173  ans.resize( input.shape() );
1174 
1175  view_4d<value_type> ta{ ans.data(), batch_size, row, col, channel };
1176 
1177  Tsor grad_ = grad;
1178  view_4d<value_type> tg{ grad_.data(), batch_size, row/stride, col/stride, channel };
1179 
1180  for ( auto bs : range( batch_size ) )
1181  for ( auto r : range( row ) )
1182  for ( auto c : range( col ) )
1183  for ( auto ch : range( channel ) )
1184  if ( std::abs(tm[bs][r][c][ch] - 1.0) < 1.0e-5 )
1185  ta[bs][r][c][ch] = tg[bs][r/stride][c/stride][ch];
1186  return ans;
1187  };
1188  };
1189  }
1190 
1191  }; // max_pooling_2d_context
1192 
1193  } // anonymous namespace
1194 
1195 
1196  // comment: maybe use a generic 'reduce' function here to reduce the code complexity, at a price of performance?
1197  inline auto max_pooling_2d( unsigned long stride ) noexcept
1198  {
1199  better_assert( stride > 1, "Expecting max_pooling_2d stride greater than 1, but got ", stride );
1200 
1201  std::shared_ptr<std::any> mask = std::make_shared<std::any>();
1202  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1203  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1204 
1205  return [stride, mask, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
1206  {
1207  return make_unary_operator
1208  (
1209  max_pooling_2d_context{}.make_forward()( stride, mask, forward_cache ),
1210  max_pooling_2d_context{}.make_backward()( stride, mask, backward_cache ),
1211  "MaxPooling2D"
1212  )( ex );
1213  };
1214  }
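
 // Pooling usage sketch (expects a 4D input [BS, R, C, CH]; the stride should
 // divide R and C):
 //
 //   variable<tensor<float>> x{ random<float>( {16, 28, 28, 8} ) };
 //   auto y = max_pooling_2d( 2 )( x );      // -> [16, 14, 14, 8], keeping the per-window maximum
 //   auto z = average_pooling_2d( 2 )( x );  // -> [16, 14, 14, 8], keeping the per-window mean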
1215 
1216  inline auto average_pooling_2d( unsigned long stride ) noexcept
1217  {
1218  better_assert( stride > 1, "Expecting average_pooling_2d stride greater than 1, but got ", stride );
1219 
1220  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1221  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1222 
1223  return [stride, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
1224  {
1225  return make_unary_operator
1226  (
1227  [stride, forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept // [BS, R, C, CH] --> [BS, R/s, C/s, CH]
1228  {
1229  typedef typename Tsor::value_type value_type;
1230  better_assert( input.ndim() == 4, "Expecting a 4D tensor, but got ", input.ndim() );
1231 
1232  std::vector<unsigned long> shape = input.shape();
1233  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1234  Tsor input_ = input;
1235  view_4d<value_type> ts{ input_.data(), batch_size, row, col, channel };
1236 
1237  Tsor& ans = context_cast<Tsor>( forward_cache );
1238  ans.resize( {batch_size, row/stride, col/stride, channel} );
1239  std::fill( ans.begin(), ans.end(), value_type{0} );
1240 
1241  view_4d<value_type> t1{ ans.data(), batch_size, row/stride, col/stride, channel };
1242 
1243  value_type const factor = value_type{1} / static_cast<value_type>(stride*stride);
1244  for ( auto bs : range(batch_size) )
1245  for ( auto r : range(row/stride) ) // row for t1
1246  for ( auto c : range(col/stride) ) // col for t1
1247  for ( auto ch : range(channel) )
1248  for ( auto _r : range( (r*stride), ((r*stride)+stride) ) ) // row for ts
1249  for ( auto _c : range( (c*stride), ((c*stride)+stride) ) ) // col for ts
1250  t1[bs][r][c][ch] += ts[bs][_r][_c][ch] * factor;
1251  return ans;
1252  },
1253  [stride, backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
1254  {
1255  typedef typename Tsor::value_type value_type;
1256  std::vector<unsigned long> const& shape = input.shape();
1257  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1258 
1259  Tsor& ans = context_cast<Tsor>( backward_cache );
1260  ans.resize( input.shape() );
1261 
1262  view_4d<value_type> ta{ ans.data(), batch_size, row, col, channel };
1263 
1264  Tsor grad_ = grad;
1265  view_4d<value_type> tg{ grad_.data(), batch_size, row/stride, col/stride, channel };
1266 
1267  value_type const factor = value_type{1} / static_cast<value_type>(stride*stride);
1268  for ( auto bs : range( batch_size ) )
1269  for ( auto r : range( row ) )
1270  for ( auto c : range( col ) )
1271  for ( auto ch : range( channel ) )
1272  ta[bs][r][c][ch] = factor * tg[bs][r/stride][c/stride][ch];
1273  return ans;
1274  },
1275  "AveragePooling2D"
1276  )( ex );
1277  };
1278  }
1279 
1280  namespace
1281  {
1282  struct up_sampling_2d_context
1283  {
1284  auto make_forward() const noexcept
1285  {
1286  return []( unsigned long stride, std::shared_ptr<std::any> forward_cache ) noexcept
1287  {
1288  return [=]<Tensor Tsor>( Tsor const& input ) noexcept
1289  {
1290  typedef typename Tsor::value_type value_type;
1291  better_assert( input.ndim() == 4, "Expecting a 4D tensor, but got ", input.ndim() );
1292 
1293  std::vector<unsigned long> shape = input.shape();
1294  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1295  Tsor input_ = input;
1296  view_4d<value_type> ts{ input_.data(), batch_size, row, col, channel };
1297 
1298  Tsor& ans = context_cast<Tsor>( forward_cache );
1299  ans.resize( {batch_size, row*stride, col*stride, channel} );
1300  std::fill( ans.begin(), ans.end(), value_type{0} );
1301 
1302  view_4d<value_type> t1{ ans.data(), batch_size, row*stride, col*stride, channel };
1303 
1304  for ( auto bs : range(batch_size) )
1305  for ( auto r : range(row) ) // row for ts
1306  for ( auto c : range(col) ) // col for ts
1307  for ( auto ch : range(channel) )
1308  for ( auto _r : range( (r*stride), ((r*stride)+stride) ) ) // row for t1
1309  for ( auto _c : range( (c*stride), ((c*stride)+stride) ) ) // col for t1
1310  t1[bs][_r][_c][ch] = ts[bs][r][c][ch];
1311  return ans;
1312  };
1313  };
1314  }
1315 
1316  auto make_backward() const noexcept
1317  {
1318  return []( unsigned long stride, std::shared_ptr<std::any> backward_cache ) noexcept
1319  {
1320  return [=]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
1321  {
1322  typedef typename Tsor::value_type value_type;
1323  std::vector<unsigned long> const& shape = input.shape();
1324  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1325 
1326  Tsor& ans = context_cast<Tsor>( backward_cache );
1327  ans.resize( input.shape() );
1328  std::fill( ans.begin(), ans.end(), value_type{0} );
1329 
1330  view_4d<value_type> ta{ ans.data(), batch_size, row, col, channel };
1331 
1332  Tsor grad_ = grad;
1333  view_4d<value_type> tg{ grad_.data(), batch_size, row*stride, col*stride, channel };
1334 
1335  for ( auto bs : range( batch_size ) )
1336  for ( auto r : range( row ) )
1337  for ( auto c : range( col ) )
1338  for ( auto ch : range( channel ) )
1339  for ( auto _r : range( (r*stride), ((r*stride)+stride) ) ) // row for tg
1340  for ( auto _c : range( (c*stride), ((c*stride)+stride) ) ) // col for tg
1341  ta[bs][r][c][ch] += tg[bs][_r][_c][ch];
1342  return ans;
1343  };
1344  };
1345  }
1346  }; // up_sampling_2d_context
1347 
1348  } // anonymous namespace
1349 
1350  inline auto up_sampling_2d( unsigned long stride ) noexcept
1351  {
1352  better_assert( stride > 1, "Expecting up_sampling_2d stride greater than 1, but got ", stride );
1353 
1354  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1355  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1356 
1357  return [stride, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
1358  {
1359  return make_unary_operator
1360  (
1361  up_sampling_2d_context{}.make_forward()( stride, forward_cache ),
1362  up_sampling_2d_context{}.make_backward()( stride, backward_cache ),
1363  "UpSampling2D"
1364  )( ex );
1365  };
1366  }
1367 
1368 
1369 
1370  template< typename T=double > requires std::floating_point<T>
1371  inline auto normalization_batch( T const momentum=0.98 ) noexcept
1372  {
1373  std::shared_ptr<std::any> global_average_cache = std::make_shared<std::any>();
1374  std::shared_ptr<std::any> global_variance_cache = std::make_shared<std::any>();
1375  std::shared_ptr<std::any> average_cache = std::make_shared<std::any>();
1376  std::shared_ptr<std::any> variance_cache = std::make_shared<std::any>();
1377  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1378  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1379 
1380  return [=]<Expression Ex>( Ex const& ex ) noexcept
1381  {
1382  return make_unary_operator
1383  (
1384  [=]<Tensor Tsor>( Tsor const& input ) noexcept
1385  {
1386  better_assert( input.ndim() > 1, "normalization_batch requires input dimension at least 2, got ", input.ndim() );
1387 
1388  typedef typename Tsor::value_type value_type;
1389  //typedef typename Tsor::allocator allocator;
1390 
1391  std::vector<unsigned long> const& shape = input.shape();
1392  unsigned long const channels = *(shape.rbegin());
1393  unsigned long const rest_dims = input.size() / channels;
1394 
1395  view_2d<value_type> input_{ input.data(), rest_dims, channels };
1396 
1397  // case of prediction phase, in this phase, the batch size could be 1, and it is not possible to calculate the variance
1398  if ( learning_phase == 0 ) // defined in 'config.hpp'
1399  {
1400  // fix for the special case when prediction is executed before the training, typically in a GAN
1401  Tsor& global_average_test = context_cast<Tsor>( global_average_cache );
1402  if ( global_average_test.empty() )
1403  return input;
1404 
1405  // normal case. i.e., the global_average_cache and global_variance_cache are not empty
1406  Tsor& global_average = context_extract<Tsor>( global_average_cache );
1407  Tsor& global_variance = context_extract<Tsor>( global_variance_cache );
1408 
1409  Tsor& ans = context_cast<Tsor>( forward_cache, zeros_like( input ) );
1410  ans.resize( input.shape() ); // well, the batch sizes for training and for prediction are not necessarily same
1411 
1412  view_2d<value_type> ans_{ ans.data(), rest_dims, channels };
1413  {
1414  for ( auto r : range( rest_dims ) )
1415  for ( auto c : range( channels ) )
1416  ans_[r][c] = (input_[r][c] - global_average[c]) / std::sqrt( global_variance[c] + eps );
1417  }
1418  return ans;
1419  }
1420 
1421  //calculate average along the last channel
1422  Tsor& average = context_cast<Tsor>( average_cache );
1423  {
1424  average.resize( {channels, } );
1425  std::fill( average.begin(), average.end(), value_type{0} );
1426 
1427  for ( auto idx : range( rest_dims ) )
1428  for ( auto jdx : range( channels ) )
1429  average[jdx] += input_[idx][jdx];
1430 
1431  average /= static_cast<value_type>(rest_dims);
1432  }
1433 
1434  //calculate Variance along the last channel
1435  Tsor& variance = context_cast<Tsor>( variance_cache );
1436  {
1437  variance.resize( {channels,} );
1438  std::fill( variance.begin(), variance.end(), value_type{0} );
1439  for ( auto idx : range( rest_dims ) )
1440  for ( auto jdx : range( channels ) )
1441  variance[jdx] += std::pow( input_[idx][jdx] - average[jdx], 2 );
1442 
1443  variance /= static_cast<value_type>( rest_dims );
1444  }
1445 
1446 
1447  Tsor& ans = context_cast<Tsor>( forward_cache );
1448  ans.resize( input.shape() ); // the batch sizes for training and for prediction are not necessarily same
1449  view_2d<value_type> ans_{ ans.data(), rest_dims, channels };
1450  {
1451  for ( auto idx : range( rest_dims ) )
1452  for ( auto jdx : range( channels ) )
1453  ans_[idx][jdx] = ( input_[idx][jdx] - average[jdx] ) / std::sqrt( variance[jdx] + eps );
1454  }
1455 
1456  // update global average and global variance
1457  {
1458  Tsor& global_average = context_cast<Tsor>( global_average_cache, zeros_like( average ) );
1459  // Note: No obvious difference is observed between initializing global_variance to zeros and to ones in the MNIST example:
1460  // initializing global_variance to zeros, after 10 epochs mnist gives an error of 0.026
1461  // initializing global_variance to ones, after 10 epochs mnist gives an error of 0.028
1462  Tsor& global_variance = context_cast<Tsor>( global_variance_cache, zeros_like( variance ) );
1463  //Tsor& global_variance = context_cast<Tsor>( global_variance_cache, ones_like( variance ) );
1464  for ( auto idx : range( global_average.size() ) )
1465  {
1466  global_average[idx] = global_average[idx] * momentum + average[idx] * ( 1.0 - momentum );
1467  global_variance[idx] = global_variance[idx] * momentum + variance[idx] * ( 1.0 - momentum );
1468  }
1469  }
1470 
1471  return ans;
1472  },
1473 
1474  [=]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
1475  {
1476  typedef typename Tsor::value_type value_type;
1477  Tsor& variance = context_extract<Tsor>( variance_cache );
1478 
1479  std::vector<unsigned long> const& shape = input.shape();
1480  unsigned long const channels = *(shape.rbegin());
1481  unsigned long const rest_dims = input.size() / channels;
1482 
1483  Tsor& ans = context_cast<Tsor>( backward_cache, zeros_like( input ) );
1484  view_2d<value_type> ans_{ans.data(), rest_dims, channels };
1485  view_2d<value_type> grad_{grad.data(), rest_dims, channels };
1486  for ( auto r : range( rest_dims ) )
1487  for ( auto c : range( channels ) )
1488  ans_[r][c] = grad_[r][c] / std::sqrt( variance[c] + eps );
1489  return ans;
1490  },
1491  "Normalization"
1492  )( ex );
1493  };
1494  }
1495 
1496 
1497 
1498  template< typename T > requires std::floating_point<T>
1499  inline auto batch_normalization( T const momentum=0.98 ) noexcept
1500  {
1501  return [=]<Expression Ex, Variable Va>( Ex const& ex, Va const& gamma, Va const& beta ) noexcept
1502  {
1503  return elementwise_product( normalization_batch(momentum)(ex), gamma ) + beta; // elementwise scale and shift, broadcasting over the batch: the normalized output is of shape [BS, R, C, CH], gamma/beta are of shape [R, C, CH]
1504  };
1505  }
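
 // Batch-normalization usage sketch: gamma and beta are trainable variables with
 // the per-sample shape of the normalized expression (a hedged example; shapes
 // and initial values are illustrative):
 //
 //   variable<tensor<float>> x{ random<float>( {32, 10} ) };
 //   variable<tensor<float>> gamma{ random<float>( {10,} ) };  // typically initialized to ones
 //   variable<tensor<float>> beta { random<float>( {10,} ) };  // typically initialized to zeros
 //   auto y = batch_normalization( 0.98 )( x, gamma, beta );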
1506 
1507 
1508 
1509  //
1510  // example:
1511  //
1512  // variable<tensor<float>> a {... };
1513  // variable<tensor<float>> b {... };
1514  // auto cab = concatenate( a, b )();
1515  //
1516  template< Expression Lhs_Expression, Expression Rhs_Expression >
1517  auto constexpr concatenate( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
1518  {
1519  return [&]( unsigned long axe = -1 ) noexcept
1520  {
1521  return make_binary_operator
1522  (
1523  [axe]<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
1524  {
1525  return concatenate( lhs_tensor, rhs_tensor, axe );
1526  },
1527  [axe]<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const grad ) noexcept
1528  {
1529  typedef typename Tsor::value_type value_type;
1530 
1531  Tsor l_ans{ lhs_input.shape() };
1532  Tsor r_ans{ rhs_input.shape() };
1533  better_assert( l_ans.size() + r_ans.size() == grad.size(), "size mismatch: lhs size is ", l_ans.size(), " rhs size is ", r_ans.size(), " and grad size is ", grad.size(),
1534  " with lhs dim is ", l_ans.ndim(), " and rhs dim is ", r_ans.ndim() );
1535 
1536  // 2D view of grad
1537  unsigned long const ax = (axe == (unsigned long)(-1)) ? grad.ndim()-1 : axe;
1538  unsigned long const g_col = std::accumulate( grad.shape().begin()+ax, grad.shape().end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } );
1539  unsigned long const g_row = grad.size() / g_col;
1540  view_2d<value_type> v_g{ grad.data(), g_row, g_col };
1541 
1542  // 2D view of l_ans
1543  unsigned long const lhs_row = g_row;
1544  unsigned long const lhs_col = lhs_input.size() / lhs_row;
1545  view_2d<value_type> v_l{ l_ans.data(), lhs_row, lhs_col };
1546 
1547  // 2D view of r_ans
1548  unsigned long const rhs_row = g_row;
1549  unsigned long const rhs_col = rhs_input.size() / rhs_row;
1550  view_2d<value_type> v_r{ r_ans.data(), rhs_row, rhs_col };
1551 
 1552  better_assert( g_col == lhs_col + rhs_col, "last dimensions do not agree" );
1553 
1554  for ( unsigned long idx = 0; idx != g_row; ++idx )
1555  {
1556  std::copy( v_g[idx], v_g[idx]+lhs_col, v_l[idx] ); // fill idx-th row of 'v_l'
1557  std::copy( v_g[idx]+lhs_col, v_g[idx]+g_col, v_r[idx] ); // fill idx-th row of 'v_r'
1558  }
1559 
1560  return std::make_tuple( l_ans, r_ans );
1561  },
1562  "Concatenate"
1563  )( lhs_ex, rhs_ex );
1564  };
1565  }
1566 
 1567  // just to keep this interface consistent with Keras
1568  inline auto concatenate( unsigned long axe = -1 )
1569  {
1570 
1571  return [=]< Expression Lhs_Expression, Expression Rhs_Expression >( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
1572  {
1573  return concatenate( lhs_ex, rhs_ex )( axe );
1574  };
1575  }
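  //
  // usage sketch (illustrative only; mirrors the expression-first overload above,
  // with hypothetical variables a and b):
  //
  // variable<tensor<float>> a {... };
  // variable<tensor<float>> b {... };
  // auto cab = concatenate( 1 )( a, b ); // concatenate along axis 1
  //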
1576 
1577  // alias of 'concatenate'
1578  template< Expression Lhs_Expression, Expression Rhs_Expression >
1579  auto constexpr concat( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
1580  {
1581  return concatenate( lhs_ex, rhs_ex )();
1582  }
1583 
1584  // alias of 'concatenate'
1585  inline auto concat( unsigned long axe = -1 )
1586  {
1587  return concatenate( axe );
1588  }
1589 
1590  template< Expression Lhs_Expression, Expression Rhs_Expression >
1591  auto constexpr maximum( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
1592  {
1593  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1594  std::shared_ptr<std::any> mask_cache = std::make_shared<std::any>();
1595  std::shared_ptr<std::any> backward_cache_lhs = std::make_shared<std::any>();
1596  std::shared_ptr<std::any> backward_cache_rhs = std::make_shared<std::any>();
1597  return make_binary_operator
1598  (
1599  [=]<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
1600  {
1601  better_assert( lhs_tensor.shape() == rhs_tensor.shape(), "tensor shape mismatch." );
1602 
1603  Tsor& ans = context_cast<Tsor>( forward_cache );
1604  ans.resize( lhs_tensor.shape() );
1605  Tsor& mask = context_cast<Tsor>( mask_cache ); // 1 if lhs element is larger, 0 if rhs element is larger
1606  mask.resize( lhs_tensor.shape() );
1607 
1608  for_each( lhs_tensor.begin(), lhs_tensor.end(), rhs_tensor.begin(), ans.begin(), mask.begin(), []( auto const l, auto const r, auto& a, auto& m ) { m = l > r ? 1.0 : 0.0; a = l > r ? l : r; } );
1609 
1610  return ans;
1611  },
1612  [=]<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const& grad ) noexcept
1613  {
1614  Tsor& mask = context_cast<Tsor>( mask_cache ); // 1 if lhs element is larger, 0 if rhs element is larger
1615 
1616  Tsor& l_ans = context_cast<Tsor>( backward_cache_lhs );
1617  l_ans.resize( lhs_input.shape() );
1618  Tsor& r_ans = context_cast<Tsor>( backward_cache_rhs );
1619  r_ans.resize( rhs_input.shape() );
1620 
1621  for_each( grad.begin(), grad.end(), mask.begin(), l_ans.begin(), r_ans.begin(), []( auto const g, auto const m, auto& l, auto& r ) { if ( m > 0.5 ) { l = g; r = 0.0; } else { l = 0.0; r = g; } } );
1622 
1623  return std::make_tuple( l_ans, r_ans );
1624  },
1625  "Maximum"
1626  )( lhs_ex, rhs_ex );
1627  }
1628 
1629  template< Expression Lhs_Expression, Expression Rhs_Expression >
1630  auto constexpr minimum( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
1631  {
1632  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1633  std::shared_ptr<std::any> mask_cache = std::make_shared<std::any>();
1634  std::shared_ptr<std::any> backward_cache_lhs = std::make_shared<std::any>();
1635  std::shared_ptr<std::any> backward_cache_rhs = std::make_shared<std::any>();
1636  return make_binary_operator
1637  (
1638  [=]<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
1639  {
1640  better_assert( lhs_tensor.shape() == rhs_tensor.shape(), "tensor shape mismatch." );
1641 
1642  Tsor& ans = context_cast<Tsor>( forward_cache );
1643  ans.resize( lhs_tensor.shape() );
 1644  Tsor& mask = context_cast<Tsor>( mask_cache ); // 1 if lhs element is smaller (or equal), 0 if rhs element is smaller
1645  mask.resize( lhs_tensor.shape() );
1646 
1647  for_each( lhs_tensor.begin(), lhs_tensor.end(), rhs_tensor.begin(), ans.begin(), mask.begin(), []( auto const l, auto const r, auto& a, auto& m ) { m = l > r ? 0.0: 1.0 ; a = l > r ? r: l; } );
1648 
1649  return ans;
1650  },
1651  [=]<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const& grad ) noexcept
1652  {
 1653  Tsor& mask = context_cast<Tsor>( mask_cache ); // 1 if lhs element is smaller (or equal), 0 if rhs element is smaller
1654 
1655  Tsor& l_ans = context_cast<Tsor>( backward_cache_lhs );
1656  l_ans.resize( lhs_input.shape() );
1657  Tsor& r_ans = context_cast<Tsor>( backward_cache_rhs );
1658  r_ans.resize( rhs_input.shape() );
1659 
 1660  for_each( grad.begin(), grad.end(), mask.begin(), l_ans.begin(), r_ans.begin(), []( auto const g, auto const m, auto& l, auto& r ) { if ( m > 0.5 ) { l = g; r = 0.0; } else { l = 0.0; r = g; } } ); // route the gradient to the operand selected by the mask (mask==1 means lhs held the minimum)
1661 
1662  return std::make_tuple( l_ans, r_ans );
1663  },
1664  "Minmum"
1665  )( lhs_ex, rhs_ex );
1666  }
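  //
  // usage sketch (illustrative only, hypothetical variables): the forward pass of
  // maximum/minimum records a mask so that the backward pass routes the incoming
  // gradient only to the operand selected element-wise.
  //
  // variable<tensor<float>> a {... };
  // variable<tensor<float>> b {... };
  // auto hi = maximum( a, b );
  // auto lo = minimum( a, b );
  //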
1667 
1671  template< Expression Lhs_Expression, Expression Rhs_Expression >
1672  auto constexpr atan2( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
1673  {
1674  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1675  std::shared_ptr<std::any> backward_cache_lhs = std::make_shared<std::any>();
1676  std::shared_ptr<std::any> backward_cache_rhs = std::make_shared<std::any>();
1677  return make_binary_operator
1678  (
1679  [=]<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
1680  {
1681  better_assert( lhs_tensor.shape() == rhs_tensor.shape(), "tensor shape mismatch." );
1682  Tsor& ans = context_cast<Tsor>( forward_cache );
1683  ans.resize( lhs_tensor.shape() );
1684  for_each( lhs_tensor.begin(), lhs_tensor.end(), rhs_tensor.begin(), ans.begin(), []( auto const l, auto const r, auto& a ) { a = std::atan2(l, r); } );
1685  return ans;
1686  },
1687  [=]<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const& grad ) noexcept
1688  {
1689  Tsor& l_ans = context_cast<Tsor>( backward_cache_lhs );
1690  l_ans.resize( lhs_input.shape() );
1691  Tsor& r_ans = context_cast<Tsor>( backward_cache_rhs );
1692  r_ans.resize( rhs_input.shape() );
 1693  for_each( grad.begin(), grad.end(), l_ans.begin(), r_ans.begin(), lhs_input.begin(), rhs_input.begin(), []( auto const g, auto& l, auto& r, auto const x, auto const y ) { auto const c = x*x+y*y; l = g*y/c; r = -g*x/c; } ); // d/dl atan2(l,r) = r/(l*l+r*r), d/dr atan2(l,r) = -l/(l*l+r*r)
1694  return std::make_tuple( l_ans, r_ans );
1695  },
1696  "Arctan2"
1697  )( lhs_ex, rhs_ex );
1698  }
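  //
  // usage sketch (illustrative only, hypothetical variables): with z = atan2( l, r ),
  // the partial derivatives are dz/dl = r/(l*l+r*r) and dz/dr = -l/(l*l+r*r).
  //
  // variable<tensor<float>> y {... };
  // variable<tensor<float>> x {... };
  // auto angle = atan2( y, x );
  //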
1699 
1700 
1713  template< typename T=float > requires std::floating_point<T>
1714  inline auto random_normal_like( T mean = 0.0, T stddev = 1.0 ) noexcept
1715  {
1716  return [=]<Expression Ex>(Ex const& ex ) noexcept
1717  {
1718  return make_unary_operator
1719  (
1720  [=]<Tensor Tsor>( Tsor const& tsor ) noexcept
1721  {
1722  //debug_log( "Trying to generate random variables from a normal distribution of mean ", mean, " and stddev ", stddev );
1723  return randn_like( tsor, mean, stddev );
1724  },
1725  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
1726  {
1727  return zeros_like( grad );
1728  },
1729  "RandomNormalLike"
1730  )(ex);
1731  };
1732  }
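  //
  // usage sketch (illustrative only, hypothetical variable): generates a tensor of
  // normally distributed samples shaped like the wrapped expression; no gradient
  // flows back through this operator.
  //
  // variable<tensor<float>> a {... };
  // auto noise = random_normal_like( 0.0f, 1.0f )( a );
  //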
1733 
1743  template< Expression Ex>
1744  auto ones_like( Ex const& ex ) noexcept
1745  {
1746  return make_unary_operator
1747  (
1748  []<Tensor Tsor>( Tsor const& tsor ) noexcept { return ones_like( tsor ); },
1749  []<Tensor Tsor>( Tsor const&, Tsor const& , Tsor const& grad ) noexcept { return zeros_like( grad ); },
1750  "OnesLike"
1751  )(ex);
1752  }
1753 
1763  template< Expression Ex>
1764  auto zeros_like( Ex const& ex ) noexcept
1765  {
1766  return make_unary_operator
1767  (
1768  []<Tensor Tsor>( Tsor const& tsor ) noexcept { return zeros_like( tsor ); },
1769  []<Tensor Tsor>( Tsor const&, Tsor const& , Tsor const& grad ) noexcept { return zeros_like( grad ); },
1770  "ZerosLike"
1771  )(ex);
1772  }
1773 
1788  template< Expression Lhs_Expression, Expression Rhs_Expression, std::floating_point FP >
1789  auto constexpr equal( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex, FP threshold=0.5 ) noexcept
1790  {
1791  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1792  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1793  return make_binary_operator
1794  (
1795  [=]<Tensor Tsor>( Tsor const& lhs_tensor, Tsor const& rhs_tensor ) noexcept
1796  {
1797  typedef typename Tsor::value_type value_type;
1798  better_assert( lhs_tensor.shape() == rhs_tensor.shape(), "equal: tensor shape mismatch." );
1799 
1800  Tsor& ans = context_cast<Tsor>( forward_cache );
1801  ans.resize( lhs_tensor.shape() );
1802  for_each( lhs_tensor.begin(), lhs_tensor.end(), rhs_tensor.begin(), ans.begin(), [threshold]( auto l, auto r, auto& v ){ v = (std::abs(l-r) > threshold) ? value_type{0} : value_type{1}; } );
1803  return ans;
1804  },
1805  [=]<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const& grad ) noexcept
1806  {
1807  typedef typename Tsor::value_type value_type;
1808  Tsor& ans = context_cast<Tsor>( backward_cache );
 1809  ans.resize( lhs_input.shape() ); std::fill( ans.begin(), ans.end(), value_type{0} ); // zero gradient, shaped like the inputs
1810  return std::make_tuple( ans, ans );
1811  },
1812  "Equal"
1813  )( lhs_ex, rhs_ex );
1814  }
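  //
  // usage sketch (illustrative only, hypothetical variables): the result holds 1
  // where |lhs - rhs| <= threshold and 0 elsewhere; the backward pass produces
  // zero gradients for both operands.
  //
  // variable<tensor<float>> a {... };
  // variable<tensor<float>> b {... };
  // auto same = equal( a, b, 0.5f );
  //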
1815 
1828  template <Expression Ex>
1829  auto constexpr sign( Ex const& ex ) noexcept
1830  {
1831  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1832  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1833  return make_unary_operator
1834  (
1835  [=]<Tensor Tsor>( Tsor const& input ) noexcept
1836  {
1837  typedef typename Tsor::value_type value_type;
1838  Tsor& ans = context_cast<Tsor>( forward_cache );
1839  ans.resize( input.shape() );
1840  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ){ v = (value_type{0} < x) - (x < value_type{0}); } );
1841  return ans;
1842  },
1843  [=]<Tensor Tsor>( Tsor const&input, Tsor const&, Tsor const& grad ) noexcept
1844  {
1845  typedef typename Tsor::value_type value_type;
1846  Tsor& ans = context_cast<Tsor>( backward_cache );
1847  ans.resize( input.shape() );
1848  std::fill( ans.begin(), ans.end(), value_type{0} ); //TF gives zeros, we follow TF here
1849  return ans;
1850  },
1851  "Sign"
1852  )( ex );
1853  };
1854 
1855 
1856 
1857  namespace
1858  {
1859  struct zero_padding_2d_context
1860  {
1861  auto make_forward() const noexcept
1862  {
1863  return []( unsigned long top, unsigned long bottom, unsigned long left, unsigned long right, std::shared_ptr<std::any> forward_cache ) noexcept
1864  {
1865  return [=]<Tensor Tsor>( Tsor const& input ) noexcept
1866  {
1867  typedef typename Tsor::value_type value_type;
1868  better_assert( input.ndim() == 4, "Expecting a 4D tensor, but got ", input.ndim() );
1869 
1870  // 4D view of input tensor
1871  std::vector<unsigned long> shape = input.shape();
1872  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1873  Tsor input_ = input;
1874  view_4d<value_type> ts{ input_.data(), batch_size, row, col, channel };
1875 
1876  // 4D view of output tensor
1877  Tsor& ans = context_cast<Tsor>( forward_cache );
1878  ans.resize( {batch_size, top+row+bottom, left+col+right, channel} );
1879  view_4d<value_type> ta{ ans.data(), batch_size, top+row+bottom, left+col+right, channel };
1880 
1881  for ( auto bs : range( batch_size ) )
1882  for ( auto r : range( row ) )
1883  for ( auto c : range( col ) )
1884  for ( auto ch : range( channel ) )
1885  ta[bs][top+r][left+c][ch] = ts[bs][r][c][ch];
1886 
1887  return ans;
1888  };
1889  };
1890  }
1891 
1892  auto make_backward() const noexcept
1893  {
1894  return []( unsigned long top, unsigned long bottom, unsigned long left, unsigned long right, std::shared_ptr<std::any> backward_cache ) noexcept
1895  {
1896  return [=]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
1897  {
1898  typedef typename Tsor::value_type value_type;
1899  std::vector<unsigned long> const& shape = input.shape();
1900  auto const[batch_size, row, col, channel] = std::make_tuple(shape[0], shape[1], shape[2], shape[3]);
1901 
1902  Tsor& ans = context_cast<Tsor>( backward_cache );
1903  ans.resize( input.shape() );
1904  std::fill( ans.begin(), ans.end(), value_type{0} );
1905 
1906  view_4d<value_type> ta{ ans.data(), batch_size, row, col, channel };
1907 
1908  Tsor grad_ = grad;
1909  view_4d<value_type> tg{ grad_.data(), batch_size, top+row+bottom, left+col+right, channel };
1910 
1911  for ( auto bs : range( batch_size ) )
1912  for ( auto r : range( row ) )
1913  for ( auto c : range( col ) )
1914  for ( auto ch : range( channel ) )
1915  ta[bs][r][c][ch] = tg[bs][r+top][c+left][ch];
1916  return ans;
1917  };
1918  };
1919  }
1920  }; // zero_padding_2d_context
 1921  }//anonymous namespace
1922 
1936  inline auto zero_padding_2d( std::vector<unsigned long> const& padding ) noexcept
1937  {
1938  // extracting paddings
1939  unsigned long top, bottom, left, right;
1940  if ( padding.size() == 1 )
1941  std::tie( top, bottom, left, right ) = std::make_tuple( padding[0], padding[0], padding[0], padding[0] );
1942  else if (padding.size() == 2 )
1943  std::tie( top, bottom, left, right ) = std::make_tuple( padding[0], padding[0], padding[1], padding[1] );
1944  else if (padding.size() == 4 )
1945  std::tie( top, bottom, left, right ) = std::make_tuple( padding[0], padding[1], padding[2], padding[3] );
1946  else
 1947  better_assert( false, "Expecting padding to have size 1, 2 or 4, but got: ", padding.size() );
1948 
1949  // checking extracted paddings
1950  better_assert( top >= 1, "Expecting zero_padding_2d top padding no less than 1, but got ", top );
1951  better_assert( bottom >= 1, "Expecting zero_padding_2d bottom padding no less than 1, but got ", bottom );
1952  better_assert( left >= 1, "Expecting zero_padding_2d left padding no less than 1, but got ", left );
1953  better_assert( right >= 1, "Expecting zero_padding_2d right padding no less than 1, but got ", right );
1954 
1955  // to avoid re-allocating memory for tensors
1956  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
1957  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
1958 
1959  return [top, bottom, left, right, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
1960  {
1961  return make_unary_operator
1962  (
1963  zero_padding_2d_context{}.make_forward()( top, bottom, left, right, forward_cache ),
1964  zero_padding_2d_context{}.make_backward()( top, bottom, left, right, backward_cache ),
1965  "ZeroPadding2D"
1966  )( ex );
1967  };
1968  }
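  //
  // usage sketch (illustrative only, hypothetical variable): pads a 4D tensor of
  // shape [BS, R, C, CH] with zeros on the spatial dimensions; the padding vector
  // may hold 1 value (all sides), 2 values (top/bottom, left/right) or 4 values
  // (top, bottom, left, right).
  //
  // variable<tensor<float>> x {... };
  // auto padded = zero_padding_2d( {1, 1, 2, 2} )( x );
  //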
1969 
1970  namespace
1971  {
1972  struct repeat_context
1973  {
1974  auto make_forward() const noexcept
1975  {
1976  return []( unsigned long repeats, unsigned long axis, std::shared_ptr<std::any> forward_cache ) noexcept
1977  {
1978  return [=]<Tensor Tsor>( Tsor const& input ) noexcept
1979  {
1980  if ( 1UL == repeats ) return input;
1981  unsigned long const ax = std::min( axis, input.shape().size()-1 );
1982 
1983  auto const& shape = input.shape();
1984  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } );
1985  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax+1, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } );
1986 
1987  // generate output tensor
1988  std::vector<unsigned long> output_shape = input.shape();
1989  output_shape[ax] *= repeats;
1990 
1991  Tsor& ans = context_cast<Tsor>( forward_cache );
1992  ans.resize( output_shape );
1993 
1994  // create 2D and 3D view
1995  view_2d v2{ input.data(), iterations, stride };
1996  view_3d v3{ ans.data(), iterations, repeats, stride };
1997 
1998  // copy data
1999  for ( auto it : range( iterations ) )
2000  for ( auto re : range( repeats ) )
2001  std::copy_n( v2[it], stride, v3[it][re] );
2002 
2003  return ans;
2004  };
2005  };
2006  }
2007 
2008  auto make_backward() const noexcept
2009  {
2010  return []( unsigned long repeats, unsigned long axis, std::shared_ptr<std::any> backward_cache ) noexcept
2011  {
2012  return [=]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2013  {
2014  if ( 1UL == repeats ) return grad;
2015  unsigned long const ax = std::min( axis, input.shape().size()-1 );
2016 
2017  auto const& shape = input.shape();
2018  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } );
2019  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax+1, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } );
2020 
2021  Tsor& ans = context_cast<Tsor>( backward_cache );
2022  ans.resize( input.shape() );
2023  ans.reset();
2024 
2025  view_2d v2{ans.data(), iterations, stride };
2026  view_3d v3{ grad.data(), iterations, repeats, stride };
2027 
2028  for ( auto id : range( iterations ) )
2029  for ( auto re : range( repeats ) )
2030  for ( auto st : range( stride ) )
2031  v2[id][st] += v3[id][re][st];
2032 
2033  return ans;
2034  };
2035  };
2036  }
2037  };//struct repeat_context
2038  }//anonymous namespace
2039 
2040 
2055  inline auto repeat( unsigned long repeats, unsigned long axis=-1 ) noexcept
2056  {
 2057  better_assert( repeats > 0, "repeat: repeats cannot be zero." );
2058 
2059  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2060  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2061 
2062  return [repeats, axis, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
2063  {
2064  return make_unary_operator
2065  (
2066  repeat_context{}.make_forward()( repeats, axis, forward_cache ),
2067  repeat_context{}.make_backward()( repeats, axis, backward_cache ),
2068  "Repeat"
2069  )
2070  ( ex );
2071  };
2072  }
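  //
  // usage sketch (illustrative only, hypothetical variable): repeats elements along
  // one axis, e.g. an input of shape ( 2, 3 ) repeated 4 times along axis 1 gives
  // an output of shape ( 2, 12 ).
  //
  // variable<tensor<float>> x {... };
  // auto r = repeat( 4, 1 )( x );
  //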
2073 
2074 
2075  namespace
2076  {
2077  struct reduce_min_context
2078  {
2079  auto make_forward() const noexcept
2080  {
2081  return []( unsigned long axis, std::shared_ptr<std::any> forward_cache, std::shared_ptr<std::any> index_cache ) noexcept
2082  {
2083  return [=]<Tensor Tsor>( Tsor const& input ) noexcept
2084  {
2085  unsigned long const ax = std::min( axis, input.shape().size()-1 );
2086 
2087  // example: for an input tensor of shape ( 2, 3, 4, 5 ), and axis is 1
2088  auto const& shape = input.shape(); // example: the shape is ( 2, 3, 4, 5 )
2089  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the stride is 20
2090  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the iterations is 2
 2091  unsigned long const scales = shape[ax]; // the number of elements in the dimension to reduce. example: scales is 3
2092 
2093  // generate output tensor
 2094  std::vector<unsigned long> output_shape = input.shape(); // example: temporarily being ( 2, 3, 4, 5 )
 2095  std::copy( output_shape.begin()+ax+1, output_shape.end(), output_shape.begin()+ax ); // example: temporarily being ( 2, 4, 5, 5 )
2096  output_shape.resize( output_shape.size() - 1 ); // example: output_shape is ( 2, 4, 5 )
2097 
2098  Tsor& ans = context_cast<Tsor>( forward_cache );
2099  ans.resize( output_shape ); // example: ans shape is ( 2, 4, 5 )
2100 
2101  tensor<unsigned long>& index = context_cast<tensor<unsigned long>>( index_cache );
2102  index.resize( output_shape ); // example: index shape is ( 2, 4, 5 )
2103 
2104  // create 2D and 3D view
2105  view_2d v2{ ans.data(), iterations, stride }; // example: viewing as a matrix of shape ( 2, 20 )
2106  view_2d v_index{ index.data(), iterations, stride }; // example: viewing as a matrix of ( 2, 20 )
2107  view_3d v3{ input.data(), iterations, scales, stride }; // example: viewing as a tube of ( 2, 3, 20 )
2108 
2109  // reduce minimal elements along the selected axis
2110  for ( auto it : range( iterations ) ) // example: range (2)
2111  for ( auto st : range( stride ) ) // example: range (20)
2112  {
2113  // reduce the minimal elements along the column of st
2114  auto min_itor = std::min_element( v3[it].col_begin(st), v3[it].col_end(st) );
2115  v2[it][st] = *min_itor;
2116 
2117  // record the minimal position offset with respect to the head of the column
2118  unsigned long const offset = std::distance( v3[it].col_begin(st), min_itor );
2119  v_index[it][st] = offset;
2120  }
2121 
2122  return ans;
2123  };
2124  };
2125  }
2126 
2127  auto make_backward() const noexcept
2128  {
2129  return []( unsigned long axis, std::shared_ptr<std::any> backward_cache, std::shared_ptr<std::any> index_cache ) noexcept
2130  {
2131  return [=]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2132  {
2133  unsigned long const ax = std::min( axis, input.shape().size()-1 );
2134 
2135  // example: for an input tensor of shape ( 2, 3, 4, 5 ), and axis is 1
2136  auto const& shape = input.shape(); // example: the shape is ( 2, 3, 4, 5 )
2137  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the stride is 20
2138  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the iterations is 2
 2139  unsigned long const scales = shape[ax]; // the number of elements in the dimension to reduce. example: scales is 3
2140 
2141  std::vector<unsigned long> const& output_shape = grad.shape(); // example: output shape of ( 2, 4, 5 )
2142  tensor<unsigned long>& index = context_cast<tensor<unsigned long>>( index_cache );
2143  index.resize( output_shape ); // example: index shape is ( 2, 4, 5 )
2144 
2145  Tsor& ans = context_cast<Tsor>( backward_cache );
2146  ans.resize( shape ); // example: ans shape is ( 2, 3, 4, 5 )
2147  ans.reset();
2148 
2149  view_2d v_index{ index.data(), iterations, stride }; // example: viewing as a matrix of ( 2, 20 )
2150  view_3d v3{ ans.data(), iterations, scales, stride }; // example: view as a cube of ( 2, 3, 20 )
2151  view_2d v2{ grad.data(), iterations, stride }; // example: viewing as a matrix of ( 2, 20 )
2152 
2153  for ( auto it : range( iterations ) ) // example: range( 2 )
2154  for ( auto st : range( stride ) ) // example: range( 20 )
2155  {
2156  unsigned long const offset = v_index[it][st]; // get the offset from record
2157  v3[it][offset][st] = v2[it][st]; // only the element at the minimal position has gradient back-propagated
2158  }
2159 
2160  return ans;
2161  };
2162  };
2163  }
2164  };//struct reduce_min_context
2165  }//anonymous namespace
2166 
2167 
2181  inline auto reduce_min( unsigned long axis=-1 ) noexcept
2182  {
2183  std::shared_ptr<std::any> index_cache = std::make_shared<std::any>();
2184  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2185  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2186 
2187  return [axis, index_cache, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
2188  {
2189  return make_unary_operator
2190  (
2191  reduce_min_context{}.make_forward()( axis, forward_cache, index_cache ),
2192  reduce_min_context{}.make_backward()( axis, backward_cache, index_cache ),
2193  "ReduceMin"
2194  )
2195  ( ex );
2196  };
2197  }
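  //
  // usage sketch (illustrative only, hypothetical variable): reduces the chosen axis
  // by taking the minimum, so an input of shape ( 2, 3, 4, 5 ) reduced along axis 1
  // yields shape ( 2, 4, 5 ); only the selected elements receive gradient.
  //
  // variable<tensor<float>> x {... };
  // auto m = reduce_min( 1 )( x );
  //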
2198 
2199 
2200 
2201  namespace
2202  {
2203  struct reduce_max_context
2204  {
2205  auto make_forward() const noexcept
2206  {
2207  return []( unsigned long axis, std::shared_ptr<std::any> forward_cache, std::shared_ptr<std::any> index_cache ) noexcept
2208  {
2209  return [=]<Tensor Tsor>( Tsor const& input ) noexcept
2210  {
2211  unsigned long const ax = std::min( axis, input.shape().size()-1 );
2212 
2213  // example: for an input tensor of shape ( 2, 3, 4, 5 ), and axis is 1
2214  auto const& shape = input.shape(); // example: the shape is ( 2, 3, 4, 5 )
2215  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the stride is 20
2216  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the iterations is 2
 2217  unsigned long const scales = shape[ax]; // the number of elements in the dimension to reduce. example: scales is 3
2218 
2219  // generate output tensor
 2220  std::vector<unsigned long> output_shape = input.shape(); // example: temporarily being ( 2, 3, 4, 5 )
 2221  std::copy( output_shape.begin()+ax+1, output_shape.end(), output_shape.begin()+ax ); // example: temporarily being ( 2, 4, 5, 5 )
2222  output_shape.resize( output_shape.size() - 1 ); // example: output_shape is ( 2, 4, 5 )
2223 
2224  Tsor& ans = context_cast<Tsor>( forward_cache );
2225  ans.resize( output_shape ); // example: ans shape is ( 2, 4, 5 )
2226 
2227  tensor<unsigned long>& index = context_cast<tensor<unsigned long>>( index_cache );
2228  index.resize( output_shape ); // example: index shape is ( 2, 4, 5 )
2229 
2230  // create 2D and 3D view
2231  view_2d v2{ ans.data(), iterations, stride }; // example: viewing as a matrix of shape ( 2, 20 )
2232  view_2d v_index{ index.data(), iterations, stride }; // example: viewing as a matrix of ( 2, 20 )
2233  view_3d v3{ input.data(), iterations, scales, stride }; // example: viewing as a tube of ( 2, 3, 20 )
2234 
2235  // reduce maximal elements along the selected axis
2236  for ( auto it : range( iterations ) ) // example: range (2)
2237  for ( auto st : range( stride ) ) // example: range (20)
2238  {
2239  // reduce the maximal elements along the column of st
2240  auto max_itor = std::max_element( v3[it].col_begin(st), v3[it].col_end(st) );
2241  v2[it][st] = *max_itor;
2242 
2243  // record the maximal position offset with respect to the head of the column
2244  unsigned long const offset = std::distance( v3[it].col_begin(st), max_itor );
2245  v_index[it][st] = offset;
2246  }
2247 
2248  return ans;
2249  };
2250  };
2251  }
2252 
2253  auto make_backward() const noexcept
2254  {
2255  return []( unsigned long axis, std::shared_ptr<std::any> backward_cache, std::shared_ptr<std::any> index_cache ) noexcept
2256  {
2257  return [=]<Tensor Tsor>( Tsor const& input, Tsor const& , Tsor const& grad ) noexcept
2258  {
2259  unsigned long const ax = std::min( axis, input.shape().size()-1 );
2260 
2261  // example: for an input tensor of shape ( 2, 3, 4, 5 ), and axis is 1
2262  auto const& shape = input.shape(); // example: the shape is ( 2, 3, 4, 5 )
2263  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the stride is 20
2264  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the iterations is 2
 2265  unsigned long const scales = shape[ax]; // the number of elements in the dimension to reduce. example: scales is 3
2266 
2267  std::vector<unsigned long> const& output_shape = grad.shape(); // example: output shape of ( 2, 4, 5 )
2268  tensor<unsigned long>& index = context_cast<tensor<unsigned long>>( index_cache );
2269  index.resize( output_shape ); // example: index shape is ( 2, 4, 5 )
2270 
2271  Tsor& ans = context_cast<Tsor>( backward_cache );
2272  ans.resize( shape ); // example: ans shape is ( 2, 3, 4, 5 )
2273  ans.reset();
2274 
2275  view_2d v_index{ index.data(), iterations, stride }; // example: viewing as a matrix of ( 2, 20 )
2276  view_3d v3{ ans.data(), iterations, scales, stride }; // example: view as a cube of ( 2, 3, 20 )
2277  view_2d v2{ grad.data(), iterations, stride }; // example: viewing as a matrix of ( 2, 20 )
2278 
2279  for ( auto it : range( iterations ) ) // example: range( 2 )
2280  for ( auto st : range( stride ) ) // example: range( 20 )
2281  {
2282  unsigned long const offset = v_index[it][st]; // get the offset from record
2283  v3[it][offset][st] = v2[it][st]; // only the element at the maximal position has gradient back-propagated
2284  }
2285 
2286  return ans;
2287  };
2288  };
2289  }
2290  };//struct reduce_max_context
2291  }//anonymous namespace
2292 
2293 
2307  inline auto reduce_max( unsigned long axis=-1 ) noexcept
2308  {
2309  std::shared_ptr<std::any> index_cache = std::make_shared<std::any>();
2310  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2311  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2312 
2313  return [axis, index_cache, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
2314  {
2315  return make_unary_operator
2316  (
2317  reduce_max_context{}.make_forward()( axis, forward_cache, index_cache ),
2318  reduce_max_context{}.make_backward()( axis, backward_cache, index_cache ),
2319  "ReduceMax"
2320  )
2321  ( ex );
2322  };
2323  }
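  //
  // usage sketch (illustrative only, hypothetical variable; mirrors reduce_min above):
  //
  // variable<tensor<float>> x {... };
  // auto m = reduce_max( 1 )( x );
  //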
2324 
2325 
2326 
2327  namespace
2328  {
2329  struct reduce_sum_context
2330  {
2331  auto make_forward() const noexcept
2332  {
2333  return []( unsigned long axis, std::shared_ptr<std::any> forward_cache ) noexcept
2334  {
2335  return [=]<Tensor Tsor>( Tsor const& input ) noexcept
2336  {
2337  typedef typename Tsor::value_type value_type;
2338 
2339  unsigned long const ax = std::min( axis, input.shape().size()-1 );
2340 
2341  // example: for an input tensor of shape ( 2, 3, 4, 5 ), and axis is 1
2342  auto const& shape = input.shape(); // example: the shape is ( 2, 3, 4, 5 )
2343  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the stride is 20
2344  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the iterations is 2
 2345  unsigned long const scales = shape[ax]; // the number of elements in the dimension to reduce. example: scales is 3
2346 
2347  // generate output tensor
 2348  std::vector<unsigned long> output_shape = input.shape(); // example: temporarily being ( 2, 3, 4, 5 )
 2349  std::copy( output_shape.begin()+ax+1, output_shape.end(), output_shape.begin()+ax ); // example: temporarily being ( 2, 4, 5, 5 )
2350  output_shape.resize( output_shape.size() - 1 ); // example: output_shape is ( 2, 4, 5 )
2351 
2352  Tsor& ans = context_cast<Tsor>( forward_cache );
2353  ans.resize( output_shape ); // example: ans shape is ( 2, 4, 5 )
2354 
2355  // create 2D and 3D view
2356  view_2d v2{ ans.data(), iterations, stride }; // example: viewing as a matrix of shape ( 2, 20 )
2357  view_3d v3{ input.data(), iterations, scales, stride }; // example: viewing as a tube of ( 2, 3, 20 )
2358 
2359  // reduce sum along the selected axis
2360  for ( auto it : range( iterations ) ) // example: range (2)
2361  for ( auto st : range( stride ) ) // example: range (20)
2362  v2[it][st] = std::accumulate( v3[it].col_begin(st), v3[it].col_end(st), value_type{0} );
2363 
2364  return ans;
2365  };
2366  };
2367  }
2368 
2369  auto make_backward() const noexcept
2370  {
2371  return []( unsigned long axis, std::shared_ptr<std::any> backward_cache ) noexcept
2372  {
2373  return [=]<Tensor Tsor>( Tsor const& input, Tsor const& , Tsor const& grad ) noexcept
2374  {
2375  unsigned long const ax = std::min( axis, input.shape().size()-1 );
2376 
2377  // example: for an input tensor of shape ( 2, 3, 4, 5 ), and axis is 1
2378  auto const& shape = input.shape(); // example: the shape is ( 2, 3, 4, 5 )
2379  unsigned long const stride = std::accumulate( shape.begin()+ax+1, shape.end(), 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the stride is 20
2380  unsigned long const iterations = std::accumulate( shape.begin(), shape.begin()+ax, 1UL, []( unsigned long x, unsigned long y ){ return x*y; } ); // example: the iterations is 2
 2381  unsigned long const scales = shape[ax]; // the number of elements in the dimension to reduce. example: scales is 3
2382 
2383  Tsor& ans = context_cast<Tsor>( backward_cache );
2384  ans.resize( shape ); // example: ans shape is ( 2, 3, 4, 5 )
2385  ans.reset();
2386 
2387  view_3d v3{ ans.data(), iterations, scales, stride }; // example: view as a cube of ( 2, 3, 20 )
2388  view_2d v2{ grad.data(), iterations, stride }; // example: viewing as a matrix of ( 2, 20 )
2389 
2390  for ( auto it : range( iterations ) ) // example: range( 2 )
2391  for ( auto st : range( stride ) ) // example: range( 20 )
2392  std::fill( v3[it].col_begin( st ), v3[it].col_end( st ), v2[it][st] );
2393 
2394  return ans;
2395  };
2396  };
2397  }
2398  };//struct reduce_sum_context
2399  }//anonymous namespace
2400 
2401 
2415  inline auto reduce_sum( unsigned long axis ) noexcept
2416  {
2417  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2418  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2419 
2420  return [axis, forward_cache, backward_cache]<Expression Ex>( Ex const& ex ) noexcept
2421  {
2422  return make_unary_operator
2423  (
2424  reduce_sum_context{}.make_forward()( axis, forward_cache ),
2425  reduce_sum_context{}.make_backward()( axis, backward_cache ),
2426  "ReduceSum"
2427  )
2428  ( ex );
2429  };
2430  }
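  //
  // usage sketch (illustrative only, hypothetical variable): sums over the chosen
  // axis; the backward pass broadcasts the incoming gradient back to every element
  // that was summed.
  //
  // variable<tensor<float>> x {... };
  // auto s = reduce_sum( 1 )( x );
  //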
2431 
2432 
2433 
2434 
2435 
2436 
2446  template <Expression Ex>
2447  auto constexpr abs( Ex const& ex ) noexcept
2448  {
2449  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2450  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2451  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2452  {
2453  Tsor& ans = context_cast<Tsor>( forward_cache );
2454  ans.resize( input.shape() );
2455  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::abs(x); } );
2456  return ans;
2457  },
2458  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2459  {
2460  Tsor& ans = context_cast<Tsor>( backward_cache );
2461  ans.resize( input.shape() );
2462  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * ((x > 0.0) ? 1.0 : ((x < 0.0) ? -1.0 : 0.0)); } );
2463  return ans;
2464  },
2465  "Abs"
2466  )( ex );
2467  };
2468 
2469 
2470 
2471 
2472 
2473 
2483  template <Expression Ex>
2484  auto constexpr acos( Ex const& ex ) noexcept
2485  {
2486  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2487  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2488  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2489  {
2490  Tsor& ans = context_cast<Tsor>( forward_cache );
2491  ans.resize( input.shape() );
2492  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::acos(x); } );
2493  return ans;
2494  },
2495  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2496  {
2497  Tsor& ans = context_cast<Tsor>( backward_cache );
2498  ans.resize( input.shape() );
2499  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = - g / std::sqrt(1.0-x*x); } );
2500  return ans;
2501  },
2502  "Acos"
2503  )( ex );
2504  };
2505 
2506 
2507 
2508 
2509 
2510 
2520  template <Expression Ex>
2521  auto constexpr acosh( Ex const& ex ) noexcept
2522  {
2523  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2524  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2525  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2526  {
2527  Tsor& ans = context_cast<Tsor>( forward_cache );
2528  ans.resize( input.shape() );
2529  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::acosh(x); } );
2530  return ans;
2531  },
2532  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2533  {
2534  Tsor& ans = context_cast<Tsor>( backward_cache );
2535  ans.resize( input.shape() );
2536  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / std::sqrt(x*x-1.0); } );
2537  return ans;
2538  },
2539  "Acosh"
2540  )( ex );
2541  };
2542 
2543 
2544 
2545 
2546 
2547 
2557  template <Expression Ex>
2558  auto constexpr asin( Ex const& ex ) noexcept
2559  {
2560  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2561  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2562  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2563  {
2564  Tsor& ans = context_cast<Tsor>( forward_cache );
2565  ans.resize( input.shape() );
2566  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::asin(x); } );
2567  return ans;
2568  },
2569  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2570  {
2571  Tsor& ans = context_cast<Tsor>( backward_cache );
2572  ans.resize( input.shape() );
2573  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / std::sqrt(1.0-x*x); } );
2574  return ans;
2575  },
2576  "Asin"
2577  )( ex );
2578  };
2579 
2580 
2581 
2582 
2583 
2584 
2594  template <Expression Ex>
2595  auto constexpr asinh( Ex const& ex ) noexcept
2596  {
2597  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2598  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2599  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2600  {
2601  Tsor& ans = context_cast<Tsor>( forward_cache );
2602  ans.resize( input.shape() );
2603  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::asinh(x); } );
2604  return ans;
2605  },
2606  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2607  {
2608  Tsor& ans = context_cast<Tsor>( backward_cache );
2609  ans.resize( input.shape() );
2610  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / std::sqrt(1.0+x*x); } );
2611  return ans;
2612  },
2613  "Asinh"
2614  )( ex );
2615  };
2616 
2617 
2618 
2619 
2620 
2621 
2631  template <Expression Ex>
2632  auto constexpr atan( Ex const& ex ) noexcept
2633  {
2634  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2635  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2636  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2637  {
2638  Tsor& ans = context_cast<Tsor>( forward_cache );
2639  ans.resize( input.shape() );
2640  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::atan(x); } );
2641  return ans;
2642  },
2643  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2644  {
2645  Tsor& ans = context_cast<Tsor>( backward_cache );
2646  ans.resize( input.shape() );
2647  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / (1.0+x*x); } );
2648  return ans;
2649  },
2650  "Atan"
2651  )( ex );
2652  };
2653 
2654 
2655 
2656 
2657 
2658 
2668  template <Expression Ex>
2669  auto constexpr atanh( Ex const& ex ) noexcept
2670  {
2671  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2672  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2673  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2674  {
2675  Tsor& ans = context_cast<Tsor>( forward_cache );
2676  ans.resize( input.shape() );
2677  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::atanh(x); } );
2678  return ans;
2679  },
2680  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2681  {
2682  Tsor& ans = context_cast<Tsor>( backward_cache );
2683  ans.resize( input.shape() );
2684  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / (1-x*x); } );
2685  return ans;
2686  },
2687  "Atanh"
2688  )( ex );
2689  };
2690 
2691 
2692 
2693 
2694 
2695 
2705  template <Expression Ex>
2706  auto constexpr cbrt( Ex const& ex ) noexcept
2707  {
2708  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2709  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2710  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2711  {
2712  Tsor& ans = context_cast<Tsor>( forward_cache );
2713  ans.resize( input.shape() );
2714  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::cbrt(x); } );
2715  return ans;
2716  },
2717  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
2718  {
2719  Tsor& ans = context_cast<Tsor>( backward_cache );
2720  ans.resize( input.shape() );
2721  for_each( input.begin(), input.end(), output.begin(), grad.begin(), ans.begin(), []( auto, auto o, auto g, auto& v ) noexcept { v = g / (3.0*o*o); } );
2722  return ans;
2723  },
2724  "Cbert"
2725  )( ex );
2726  };
2727 
2728 
2729 
2730 
2731 
2732 
2742  template <Expression Ex>
2743  auto constexpr ceil( Ex const& ex ) noexcept
2744  {
2745  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2746  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2747  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2748  {
2749  Tsor& ans = context_cast<Tsor>( forward_cache );
2750  ans.resize( input.shape() );
2751  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::ceil(x); } );
2752  return ans;
2753  },
2754  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
2755  {
2756  return grad;
2757  },
2758  "Ceil"
2759  )( ex );
2760  };
2761 
2762 
2763 
2764 
2765 
2766 
2776  template <Expression Ex>
2777  auto constexpr cos( Ex const& ex ) noexcept
2778  {
2779  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2780  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2781  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2782  {
2783  Tsor& ans = context_cast<Tsor>( forward_cache );
2784  ans.resize( input.shape() );
2785  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::cos(x); } );
2786  return ans;
2787  },
2788  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2789  {
2790  Tsor& ans = context_cast<Tsor>( backward_cache );
2791  ans.resize( input.shape() );
2792  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = - g * std::sin(x); } );
2793  return ans;
2794  },
2795  "Cos"
2796  )( ex );
2797  };
2798 
2799 
2800 
2801 
2802 
2803 
2813  template <Expression Ex>
2814  auto constexpr cosh( Ex const& ex ) noexcept
2815  {
2816  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2817  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2818  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2819  {
2820  Tsor& ans = context_cast<Tsor>( forward_cache );
2821  ans.resize( input.shape() );
2822  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::cosh(x); } );
2823  return ans;
2824  },
2825  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2826  {
2827  Tsor& ans = context_cast<Tsor>( backward_cache );
2828  ans.resize( input.shape() );
2829  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * std::sinh(x); } );
2830  return ans;
2831  },
2832  "Cosh"
2833  )( ex );
2834  };
2835 
2836 
2837 
2838 
2839 
2840 
2850  template <Expression Ex>
2851  auto constexpr erf( Ex const& ex ) noexcept
2852  {
2853  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2854  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2855  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2856  {
2857  Tsor& ans = context_cast<Tsor>( forward_cache );
2858  ans.resize( input.shape() );
2859  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::erf(x); } );
2860  return ans;
2861  },
2862  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2863  {
2864  Tsor& ans = context_cast<Tsor>( backward_cache );
2865  ans.resize( input.shape() );
2866  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = typename Tsor::value_type{1.12837916709551257389} * g * std::exp(-x*x); } );
2867  return ans;
2868  },
2869  "Erf"
2870  )( ex );
2871  };
2872 
2873 
2874 
2875 
2876 
2877 
2887  template <Expression Ex>
2888  auto constexpr erfc( Ex const& ex ) noexcept
2889  {
2890 
2891  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2892  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2893  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2894  {
2895  Tsor& ans = context_cast<Tsor>( forward_cache );
2896  ans.resize( input.shape() );
2897  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::erfc(x); } );
2898  return ans;
2899  },
2900  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
2901  {
2902  Tsor& ans = context_cast<Tsor>( backward_cache );
2903  ans.resize( input.shape() );
2904  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = typename Tsor::value_type{-1.12837916709551257389} * g * std::exp(-x*x); } );
2905  return ans;
2906  },
2907  "Erfc"
2908  )( ex );
2909  };
2910 
2911 
2912 
2913 
2914 
2915 
2925  template <Expression Ex>
2926  auto constexpr exp( Ex const& ex ) noexcept
2927  {
2928  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2929  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2930  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2931  {
2932  Tsor& ans = context_cast<Tsor>( forward_cache );
2933  ans.resize( input.shape() );
2934  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::exp(x); } );
2935  return ans;
2936  },
2937  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
2938  {
2939  Tsor& ans = context_cast<Tsor>( backward_cache );
2940  ans.resize( input.shape() );
2941  for_each( input.begin(), input.end(), output.begin(), grad.begin(), ans.begin(), []( auto, auto o, auto g, auto& v ) noexcept { v = g * o; } );
2942  return ans;
2943  },
2944  "Exp"
2945  )( ex );
2946  };
2947 
2948 
2949 
2950 
2951 
2952 
2962  template <Expression Ex>
2963  auto constexpr exp2( Ex const& ex ) noexcept
2964  {
2965  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
2966  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
2967  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
2968  {
2969  Tsor& ans = context_cast<Tsor>( forward_cache );
2970  ans.resize( input.shape() );
2971  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::exp2(x); } );
2972  return ans;
2973  },
2974  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
2975  {
2976  Tsor& ans = context_cast<Tsor>( backward_cache );
2977  ans.resize( input.shape() );
2978  for_each( input.begin(), input.end(), output.begin(), grad.begin(), ans.begin(), []( auto, auto o, auto g, auto& v ) noexcept { v = std::log(2.0) * g * o; } );
2979  return ans;
2980  },
2981  "Exp2"
2982  )( ex );
2983  };
2984 
2985 
2986 
2987 
2988 
2989 
2999  template <Expression Ex>
3000  auto constexpr expm1( Ex const& ex ) noexcept
3001  {
3002  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3003  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3004  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3005  {
3006  Tsor& ans = context_cast<Tsor>( forward_cache );
3007  ans.resize( input.shape() );
3008  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::expm1(x); } );
3009  return ans;
3010  },
3011  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
3012  {
3013  Tsor& ans = context_cast<Tsor>( backward_cache );
3014  ans.resize( input.shape() );
3015  for_each( input.begin(), input.end(), output.begin(), grad.begin(), ans.begin(), []( auto, auto o, auto g, auto& v ) noexcept { v = g * (o+1.0); } );
3016  return ans;
3017  },
3018  "Expm1"
3019  )( ex );
3020  };
3021 
3022 
3023 
3024 
3025 
3026 
3036  template <Expression Ex>
3037  auto constexpr fabs( Ex const& ex ) noexcept
3038  {
3039  return abs( ex );
3040  };
3041 
3042 
3043 
3044 
3045 
3046 
3056  template <Expression Ex>
3057  auto constexpr floor( Ex const& ex ) noexcept
3058  {
3059  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3060  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3061  {
3062  Tsor& ans = context_cast<Tsor>( forward_cache );
3063  ans.resize( input.shape() );
3064  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::floor(x); } );
3065  return ans;
3066  },
3067  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3068  {
3069  return grad;
3070  },
3071  "Floor"
3072  )( ex );
3073  };
3074 
3075 
3076 
3077 
3078 
3079 #if 0
3080 
3090  template <Expression Ex>
3091  auto constexpr ilogb( Ex const& ex ) noexcept
3092  {
3093  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3094  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3095  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3096  {
3097  Tsor& ans = context_cast<Tsor>( forward_cache );
3098  ans.resize( input.shape() );
3099  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::ilogb(x); } );
3100  return ans;
3101  },
3102  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3103  {
3104  Tsor& ans = context_cast<Tsor>( backward_cache );
3105  ans.resize( input.shape() );
3106  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * std::FIXME(x); } );
3107  return ans;
3108  },
3109  "Ilogb"
3110  )( ex );
3111  };
3112 
3113 #endif
3114 
3115 
3116 
3117 #if 0
3127  template <Expression Ex>
3128  auto constexpr lgamma( Ex const& ex ) noexcept
3129  {
3130  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3131  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3132  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3133  {
3134  Tsor& ans = context_cast<Tsor>( forward_cache );
3135  ans.resize( input.shape() );
3136  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::lgamma(x); } );
3137  return ans;
3138  },
3139  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3140  {
3141  Tsor& ans = context_cast<Tsor>( backward_cache );
3142  ans.resize( input.shape() );
3143  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * std::FIXME(x); } );
3144  return ans;
3145  },
3146  "lgamma"
3147  )( ex );
3148  };
3149 #endif
3150 
3151 
3152 
3153 
3154 
3164  template <Expression Ex>
3165  auto constexpr llrint( Ex const& ex ) noexcept
3166  {
3167  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3168  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3169  {
3170  Tsor& ans = context_cast<Tsor>( forward_cache );
3171  ans.resize( input.shape() );
3172  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::llrint(x); } );
3173  return ans;
3174  },
3175  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3176  {
3177  return grad;
3178  },
3179  "Llrint"
3180  )( ex );
3181  };
3182 
3183 
3184 
3185 
3186 
3187 
3197  template <Expression Ex>
3198  auto constexpr llround( Ex const& ex ) noexcept
3199  {
3200  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3201  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3202  {
3203  Tsor& ans = context_cast<Tsor>( forward_cache );
3204  ans.resize( input.shape() );
3205  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::llround(x); } );
3206  return ans;
3207  },
3208  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3209  {
3210  return grad;
3211  },
3212  "Llround"
3213  )( ex );
3214  };
3215 
3216 
3217 
3218 
3219 
3220 
3230  template <Expression Ex>
3231  auto constexpr log( Ex const& ex ) noexcept
3232  {
3233  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3234  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3235  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3236  {
3237  Tsor& ans = context_cast<Tsor>( forward_cache );
3238  ans.resize( input.shape() );
3239  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::log(x); } );
3240  return ans;
3241  },
3242  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3243  {
3244  Tsor& ans = context_cast<Tsor>( backward_cache );
3245  ans.resize( input.shape() );
3246  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / x; } );
3247  return ans;
3248  },
3249  "Log"
3250  )( ex );
3251  };
3252 
3253 
3254 
3255 
3256 
3257 
3267  template <Expression Ex>
3268  auto constexpr log10( Ex const& ex ) noexcept
3269  {
3270  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3271  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3272  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3273  {
3274  Tsor& ans = context_cast<Tsor>( forward_cache );
3275  ans.resize( input.shape() );
3276  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::log10(x); } );
3277  return ans;
3278  },
3279  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3280  {
3281  Tsor& ans = context_cast<Tsor>( backward_cache );
3282  ans.resize( input.shape() );
3283  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / (2.30258509299404568402*x); } );
3284  return ans;
3285  },
3286  "Log10"
3287  )( ex );
3288  };
3289 
3290 
3291 
3292 
3293 
3294 
3304  template <Expression Ex>
3305  auto constexpr log1p( Ex const& ex ) noexcept
3306  {
3307  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3308  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3309  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3310  {
3311  Tsor& ans = context_cast<Tsor>( forward_cache );
3312  ans.resize( input.shape() );
3313  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::log1p(x); } );
3314  return ans;
3315  },
3316  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3317  {
3318  Tsor& ans = context_cast<Tsor>( backward_cache );
3319  ans.resize( input.shape() );
3320  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / (x + 1.0); } );
3321  return ans;
3322  },
3323  "Log1p"
3324  )( ex );
3325  };
3326 
3327 
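The backward pass of log1p follows from d/dx log(1+x) = 1/(1+x), so the incoming gradient is divided by x + 1 rather than by x.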
3328 
3329 
3330 
3331 
3341  template <Expression Ex>
3342  auto constexpr log2( Ex const& ex ) noexcept
3343  {
3344  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3345  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3346  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3347  {
3348  Tsor& ans = context_cast<Tsor>( forward_cache );
3349  ans.resize( input.shape() );
3350  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::log2(x); } );
3351  return ans;
3352  },
3353  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3354  {
3355  Tsor& ans = context_cast<Tsor>( backward_cache );
3356  ans.resize( input.shape() );
3357  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g / (0.69314718055994530942*x); } );
3358  return ans;
3359  },
3360  "Log2"
3361  )( ex );
3362  };
3363 
3364 
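The constants in the log10 and log2 backward passes are ln 10 ≈ 2.30258509299404568402 and ln 2 ≈ 0.69314718055994530942: since log_b(x) = ln(x)/ln(b), the derivative is d/dx log_b(x) = 1/(x·ln b), hence the gradient is divided by ln(b)·x.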
3365 
3366 
3367 
3368 
3369 #if 0
3379  template <Expression Ex>
3380  auto constexpr logb( Ex const& ex ) noexcept
3381  {
3382  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3383  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3384  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3385  {
3386  Tsor& ans = context_cast<Tsor>( forward_cache );
3387  ans.resize( input.shape() );
3388  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::logb(x); } );
3389  return ans;
3390  },
3391  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3392  {
3393  Tsor& ans = context_cast<Tsor>( backward_cache );
3394  ans.resize( input.shape() );
3395  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * std::FIXME(x); } );
3396  return ans;
3397  },
3398  "Logb"
3399  )( ex );
3400  };
3401 #endif
3402 
3403 
3404 
3405 
3406 
3416  template <Expression Ex>
3417  auto constexpr lrint( Ex const& ex ) noexcept
3418  {
3419  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3421  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3422  {
3423  Tsor& ans = context_cast<Tsor>( forward_cache );
3424  ans.resize( input.shape() );
3425  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::lrint(x); } );
3426  return ans;
3427  },
3428  [backward_cache]<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3429  {
3430  return grad;
3431  },
3432  "Lrint"
3433  )( ex );
3434  };
3435 
3436 
3437 
3438 
3439 
3440 
3450  template <Expression Ex>
3451  auto constexpr lround( Ex const& ex ) noexcept
3452  {
3453  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3454  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3455  {
3456  Tsor& ans = context_cast<Tsor>( forward_cache );
3457  ans.resize( input.shape() );
3458  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::lround(x); } );
3459  return ans;
3460  },
3461  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3462  {
3463  return grad;
3464  },
3465  "Lround"
3466  )( ex );
3467  };
3468 
3469 
3470 
3471 
3472 
3473 
3483  template <Expression Ex>
3484  auto constexpr nearbyint( Ex const& ex ) noexcept
3485  {
3486  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3487  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3488  {
3489  Tsor& ans = context_cast<Tsor>( forward_cache );
3490  ans.resize( input.shape() );
3491  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::nearbyint(x); } );
3492  return ans;
3493  },
3494  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3495  {
3496  return grad;
3497  },
3498  "Nearbyint"
3499  )( ex );
3500  };
3501 
3502 
3503 
3504 
3505 
3506 
3516  template <Expression Ex>
3517  auto constexpr rint( Ex const& ex ) noexcept
3518  {
3519  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3520  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3521  {
3522  Tsor& ans = context_cast<Tsor>( forward_cache );
3523  ans.resize( input.shape() );
3524  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::rint(x); } );
3525  return ans;
3526  },
3527  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3528  {
3529  return grad;
3530  },
3531  "Rint"
3532  )( ex );
3533  };
3534 
3535 
3536 
3537 
3538 
3539 
3549  template <Expression Ex>
3550  auto constexpr round( Ex const& ex ) noexcept
3551  {
3552  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3553  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3554  {
3555  Tsor& ans = context_cast<Tsor>( forward_cache );
3556  ans.resize( input.shape() );
3557  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::round(x); } );
3558  return ans;
3559  },
3560  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3561  {
3562  return grad;
3563  },
3564  "Round"
3565  )( ex );
3566  };
3567 
3568 
3569 
3570 
3571 
3572 
3582  template <Expression Ex>
3583  auto constexpr sin( Ex const& ex ) noexcept
3584  {
3585  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3586  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3587  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3588  {
3589  Tsor& ans = context_cast<Tsor>( forward_cache );
3590  ans.resize( input.shape() );
3591  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::sin(x); } );
3592  return ans;
3593  },
3594  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3595  {
3596  Tsor& ans = context_cast<Tsor>( backward_cache );
3597  ans.resize( input.shape() );
3598  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * std::cos(x); } );
3599  return ans;
3600  },
3601  "Sin"
3602  )( ex );
3603  };
3604 
3605 
3606 
3607 
3608 
3609 
3619  template <Expression Ex>
3620  auto constexpr sinh( Ex const& ex ) noexcept
3621  {
3622  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3623  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3624  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3625  {
3626  Tsor& ans = context_cast<Tsor>( forward_cache );
3627  ans.resize( input.shape() );
3628  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::sinh(x); } );
3629  return ans;
3630  },
3631  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3632  {
3633  Tsor& ans = context_cast<Tsor>( backward_cache );
3634  ans.resize( input.shape() );
3635  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * std::cosh(x); } );
3636  return ans;
3637  },
3638  "Sinh"
3639  )( ex );
3640  };
3641 
3642 
3643 
3644 
3645 
3646 
3656  template <Expression Ex>
3657  auto constexpr sqrt( Ex const& ex ) noexcept
3658  {
3659  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3660  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3661  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3662  {
3663  Tsor& ans = context_cast<Tsor>( forward_cache );
3664  ans.resize( input.shape() );
3665  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::sqrt(x); } );
3666  return ans;
3667  },
3668  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
3669  {
3670  Tsor& ans = context_cast<Tsor>( backward_cache );
3671  ans.resize( input.shape() );
3672  for_each( input.begin(), input.end(), output.begin(), grad.begin(), ans.begin(), []( auto, auto o, auto g, auto& v ) noexcept { v = g / (o+o); } );
3673  return ans;
3674  },
3675  "Sqrt"
3676  )( ex );
3677  };
3678 
3679 
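Unlike the log family, the sqrt backward lambda also receives the cached forward output o = sqrt(x) and computes g/(2·sqrt(x)) as g/(o+o), avoiding a second std::sqrt evaluation. A minimal sketch (illustrative; x is assumed to be an Expression with non-negative values):

    auto y = ceras::sqrt( x );
    auto v = y.forward();               // v[i] = std::sqrt( x_value[i] )
    y.backward( ceras::ones_like( v ) );// x receives 1 / (2 * sqrt(x_value)) element-wise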
3680 
3681 
3682 
3683 
3693  template <Expression Ex>
3694  auto constexpr tan( Ex const& ex ) noexcept
3695  {
3696  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3697  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3698  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3699  {
3700  Tsor& ans = context_cast<Tsor>( forward_cache );
3701  ans.resize( input.shape() );
3702  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::tan(x); } );
3703  return ans;
3704  },
3705  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
3706  {
3707  Tsor& ans = context_cast<Tsor>( backward_cache );
3708  ans.resize( input.shape() );
3709  for_each( input.begin(), input.end(), output.begin(), grad.begin(), ans.begin(), []( auto, auto o, auto g, auto& v ) noexcept { v = g * (1.0+o*o); } );
3710  return ans;
3711  },
3712  "Tan"
3713  )( ex );
3714  };
3715 
3716 
3717 
3718 
3719 
3720 
3730  template <Expression Ex>
3731  auto constexpr tanh( Ex const& ex ) noexcept
3732  {
3733  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3734  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3735  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3736  {
3737  Tsor& ans = context_cast<Tsor>( forward_cache );
3738  ans.resize( input.shape() );
3739  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::tanh(x); } );
3740  return ans;
3741  },
3742  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const& output, Tsor const& grad ) noexcept
3743  {
3744  Tsor& ans = context_cast<Tsor>( backward_cache );
3745  ans.resize( input.shape() );
3746  for_each( input.begin(), input.end(), output.begin(), grad.begin(), ans.begin(), []( auto, auto o, auto g, auto& v ) noexcept { v = g * (1.0-o*o); } );
3747  return ans;
3748  },
3749  "Tanh"
3750  )( ex );
3751  };
3752 
3753 
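tan and tanh likewise express their derivatives through the cached output: d/dx tan(x) = 1 + tan^2(x) and d/dx tanh(x) = 1 - tanh^2(x), so both backward lambdas only need o and never recompute the trigonometric function.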
3754 
3755 
3756 
3757 #if 0
3758 
3768  template <Expression Ex>
3769  auto constexpr tgamma( Ex const& ex ) noexcept
3770  {
3771  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3772  std::shared_ptr<std::any> backward_cache = std::make_shared<std::any>();
3773  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3774  {
3775  Tsor& ans = context_cast<Tsor>( forward_cache );
3776  ans.resize( input.shape() );
3777  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::tgamma(x); } );
3778  return ans;
3779  },
3780  [backward_cache]<Tensor Tsor>( Tsor const& input, Tsor const&, Tsor const& grad ) noexcept
3781  {
3782  Tsor& ans = context_cast<Tsor>( backward_cache );
3783  ans.resize( input.shape() );
3784  for_each( input.begin(), input.end(), grad.begin(), ans.begin(), []( auto x, auto g, auto& v ) noexcept { v = g * std::FIXME(x); } );
3785  return ans;
3786  },
3787  "Tgamma"
3788  )( ex );
3789  };
3790 #endif
3791 
3792 
3793 
3794 
3795 
3805  template <Expression Ex>
3806  auto constexpr trunc( Ex const& ex ) noexcept
3807  {
3808  std::shared_ptr<std::any> forward_cache = std::make_shared<std::any>();
3809  return make_unary_operator( [forward_cache]<Tensor Tsor>( Tsor const& input ) noexcept
3810  {
3811  Tsor& ans = context_cast<Tsor>( forward_cache );
3812  ans.resize( input.shape() );
3813  for_each( input.begin(), input.end(), ans.begin(), []( auto x, auto& v ) noexcept { v = std::trunc(x); } );
3814  return ans;
3815  },
3816  []<Tensor Tsor>( Tsor const&, Tsor const&, Tsor const& grad ) noexcept
3817  {
3818  return grad;
3819  },
3820  "Trunc"
3821  )( ex );
3822  };
3823 
3824 
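Because every operator defined above is itself an Expression, these nodes compose into a computation graph whose forward() and backward() calls chain automatically through the unary_operator/binary_operator machinery. A minimal end-to-end sketch (illustrative; the construction of x as a variable holding a tensor is assumed, not taken from this listing):

    auto y = ceras::log( ceras::sqrt( x ) );  // y = log( sqrt(x) )
    auto v = y.forward();                     // log applied to the cached sqrt output
    y.backward( ceras::ones_like( v ) );      // x accumulates d/dx log(sqrt(x)) = 1/(2x) element-wise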
3825 #if 0
3826 
3839  template< Variable Lhs_Expression, Expression Rhs_Expression >
3840  auto constexpr assign( Lhs_Expression const& lhs_ex, Rhs_Expression const& rhs_ex ) noexcept
3841  {
3842  return make_binary_operator( []<Tensor Tsor>( Tsor& lhs_tensor, Tsor const& rhs_tensor ) noexcept // well, lhs_tensor can be 'Tensor&'
3843  {
3844  lhs_tensor.reshape( rhs_tensor.shape() );
3845  std::copy( rhs_tensor.begin(), rhs_tensor.end(), lhs_tensor.begin() );
3846  return lhs_tensor;
3847  },
3848  []<Tensor Tsor>( Tsor const& lhs_input, Tsor const& rhs_input, Tsor const&, Tsor const& ) noexcept
3849  {
3850  return std::make_tuple( zeros_like( lhs_input ), zeros_like( rhs_input ) );
3851  },
3852  "Assign"
3853  )( lhs_ex, rhs_ex );
3854  };
3855 
3856 #endif
3857 
3858 
3859 
3860 }//namespace ceras
3861 
3862 #endif//IPKVWSJOCMGGVRASCBLPYHFBCHRIVEXYBOMMDAKFAUDFYVYOOOISLRXJNUJKPJEVMLDPRDSNM
3863 
auto min(Tsor const &tsor)
Definition: tensor.hpp:1026
static constexpr auto make_binary_operator
Definition: operation.hpp:108
constexpr auto plus(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:285
requires std::floating_point< typename Tsor::value_type > Tsor variance(Tsor const &ts, unsigned long axis, bool keepdims=false) noexcept
Definition: tensor.hpp:1163
requires std::floating_point< T > void gemm(T const *A, bool a_transposed, T const *B, bool b_transposed, unsigned long m, unsigned long n, unsigned long k, T *C)
Definition: tensor.hpp:553
constexpr Tsor ones_like(Tsor const &tsor)
Definition: tensor.hpp:1002
constexpr auto sum_reduce(Ex const &ex) noexcept
Definition: operation.hpp:450
constexpr bool is_binary_operator_v
Definition: operation.hpp:148
Tsor add(Tsor const &lhs, Tsor const &rhs) noexcept
Definition: tensor.hpp:604
std::string computation_graph(Ex const &ex) noexcept
Definition: operation.hpp:178
concept Operator
A type that represents a unary or a binary operator.
Definition: operation.hpp:162
concept Unary_Operator
A type that represents a unary operator.
Definition: operation.hpp:135
void multiply(Tsor const &lhs, Tsor const &rhs, Tsor &ans) noexcept
Definition: tensor.hpp:699
constexpr auto square(Ex const &ex) noexcept
Definition: operation.hpp:563
concept Binary_Operator
A type that represents a binary operator.
Definition: operation.hpp:155
bool has_nan(Tsor const &tsor)
Definition: tensor.hpp:1095
constexpr auto reduce_sum(Ex const &ex) noexcept
Definition: operation.hpp:470
concept Tensor
Definition: tensor.hpp:362
auto operator+(C const &c) noexcept
Returns the complex expression.
Definition: complex_operator.hpp:154
Tsor elementwise_divide(Tsor const &lhs, Tsor const &rhs) noexcept
Definition: tensor.hpp:768
auto abs(C const &c) noexcept
Returns the magnitude of the complex expression.
Definition: complex_operator.hpp:67
constexpr auto mean(Ex const &ex) noexcept
An alias name of mean_reduce.
Definition: operation.hpp:522
Tsor reshape(Tsor const &ts, std::vector< unsigned long > const &new_shape)
Definition: tensor.hpp:692
concept Expression
A type that represents a unary operator, a binary operator, a variable, a place_holder,...
Definition: operation.hpp:169
constexpr auto minus(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:531
constexpr auto negative(Ex const &ex) noexcept
Definition: operation.hpp:389
auto sum(Tsor const &tsor)
Definition: tensor.hpp:1044
static constexpr auto make_unary_operator
Definition: operation.hpp:49
auto operator-(C const &c) noexcept
Negatives the complex expression.
Definition: complex_operator.hpp:163
constexpr bool is_unary_operator_v
Definition: operation.hpp:128
auto max(Tsor const &tsor)
Definition: tensor.hpp:1008
constexpr auto reduce_mean(Ex const &ex) noexcept
An alias name of mean_reduce.
Definition: operation.hpp:513
Tsor randn_like(Tsor const &tsor, typename Tsor::value_type mean=0, typename Tsor::value_type stddev=1)
Definition: tensor.hpp:884
Tsor clip(Tsor &tsor, typename Tsor::value_type lower=0, typename Tsor::value_type upper=1)
Definition: tensor.hpp:810
constexpr auto elementwise_product(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:412
concept Variable
Definition: variable.hpp:186
Tsor copy(Tsor const &tsor)
Definition: tensor.hpp:908
constexpr auto hadamard_product(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:444
bool has_inf(Tsor const &tsor)
Definition: tensor.hpp:1101
constexpr auto elementwise_multiply(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:438
auto operator*(Cl const &cl, Cr const &cr) noexcept
Multiplies two complex expressions. Optimization here: (a+ib)*(c+id) = (ac-bd) + i(ad+bc) = (ac-bd) +...
Definition: complex_operator.hpp:200
constexpr auto mean_reduce(Ex const &ex) noexcept
Computes the mean of elements across all dimensions of an expression.
Definition: operation.hpp:488
auto transpose(Ex const &ex) noexcept
Definition: operation.hpp:821
requires std::floating_point< T > auto batch_normalization(T const momentum=0.98) noexcept
Definition: operation.hpp:1499
constexpr auto sinh(Ex const &ex) noexcept
Computes Sinh of the given expression.
Definition: operation.hpp:3620
constexpr auto concat(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:1579
constexpr auto sin(Ex const &ex) noexcept
Computes Sin of the given expression.
Definition: operation.hpp:3583
auto reduce_sum(unsigned long axis) noexcept
Reduce sum elements along an axis.
Definition: operation.hpp:2415
constexpr auto equal(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex, FP threshold=0.5) noexcept
Definition: operation.hpp:1789
auto conv2d(unsigned long row_input, unsigned long col_input, unsigned long const row_stride=1, unsigned long const col_stride=1, unsigned long const row_dilation=1, unsigned long const col_dilation=1, std::string const &padding="valid") noexcept
Definition: operation.hpp:994
auto repeat(unsigned long repeats, unsigned long axis=-1) noexcept
Repeats elements along an axis.
Definition: operation.hpp:2055
constexpr auto erfc(Ex const &ex) noexcept
Computes Erfc of the given expression.
Definition: operation.hpp:2888
constexpr auto tan(Ex const &ex) noexcept
Computes Tan of the given expression.
Definition: operation.hpp:3694
constexpr auto cbrt(Ex const &ex) noexcept
Computes Cbrt of the given expression.
Definition: operation.hpp:2706
constexpr auto log10(Ex const &ex) noexcept
Computes Log10 of the given expression.
Definition: operation.hpp:3268
constexpr auto log2(Ex const &ex) noexcept
Computes Log2 of the given expression.
Definition: operation.hpp:3342
auto ones_like(Ex const &ex) noexcept
Definition: operation.hpp:1744
constexpr auto log1p(Ex const &ex) noexcept
Computes Log1p of the given expression.
Definition: operation.hpp:3305
constexpr auto minimum(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:1630
constexpr auto hypot(Ex const &ex, Ey const &ey) noexcept
Definition: operation.hpp:632
constexpr auto exp2(Ex const &ex) noexcept
Computes Exp2 of the given expression.
Definition: operation.hpp:2963
constexpr auto cosh(Ex const &ex) noexcept
Computes Cosh of the given expression.
Definition: operation.hpp:2814
constexpr auto ceil(Ex const &ex) noexcept
Computes Ceil of the given expression.
Definition: operation.hpp:2743
constexpr auto atan2(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
Definition: operation.hpp:1672
constexpr auto cos(Ex const &ex) noexcept
Computes Cos of the given expression.
Definition: operation.hpp:2777
requires std::floating_point< T > auto random_normal_like(T mean=0.0, T stddev=1.0) noexcept
Definition: operation.hpp:1714
constexpr auto round(Ex const &ex) noexcept
Computes Round of the given expression.
Definition: operation.hpp:3550
auto up_sampling_2d(unsigned long stride) noexcept
Definition: operation.hpp:1350
auto reduce_min(unsigned long axis=-1) noexcept
Reduce minimal elements along an axis.
Definition: operation.hpp:2181
auto average_pooling_2d(unsigned long stride) noexcept
Definition: operation.hpp:1216
constexpr auto expm1(Ex const &ex) noexcept
Computes Expm1 of the given expression.
Definition: operation.hpp:3000
constexpr auto rint(Ex const &ex) noexcept
Computes Rint of the given expression.
Definition: operation.hpp:3517
constexpr auto exp(Ex const &ex) noexcept
Computes Exp of the given expression.
Definition: operation.hpp:2926
constexpr auto asin(Ex const &ex) noexcept
Computes Asin of the given expression.
Definition: operation.hpp:2558
constexpr auto sqrt(Ex const &ex) noexcept
Computes Sqrt of the given expression.
Definition: operation.hpp:3657
constexpr auto fabs(Ex const &ex) noexcept
Computes Fabs of the given expression.
Definition: operation.hpp:3037
constexpr auto maximum(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:1591
auto img2col(unsigned long const row_kernel, unsigned long col_kernel=-1, unsigned long const row_padding=0, unsigned long col_padding=0, unsigned long const row_stride=1, unsigned long const col_stride=1, unsigned long const row_dilation=1, unsigned long const col_dilation=1) noexcept
Definition: operation.hpp:870
constexpr auto llround(Ex const &ex) noexcept
Computes Llround of the given expression.
Definition: operation.hpp:3198
constexpr auto llrint(Ex const &ex) noexcept
Computes Llrint of the given expression.
Definition: operation.hpp:3165
constexpr auto acosh(Ex const &ex) noexcept
Computes Acosh of the given expression.
Definition: operation.hpp:2521
constexpr auto acos(Ex const &ex) noexcept
Computes Acos of the given expression.
Definition: operation.hpp:2484
constexpr auto log(Ex const &ex) noexcept
Computes Log of the given expression.
Definition: operation.hpp:3231
constexpr auto atan(Ex const &ex) noexcept
Computes Atan of the given expression.
Definition: operation.hpp:2632
constexpr auto floor(Ex const &ex) noexcept
Computes Floor of the given expression.
Definition: operation.hpp:3057
constexpr auto abs(Ex const &ex) noexcept
Computes Abs of the given expression.
Definition: operation.hpp:2447
constexpr auto asinh(Ex const &ex) noexcept
Computes Asinh of the given expression.
Definition: operation.hpp:2595
constexpr auto trunc(Ex const &ex) noexcept
Computes Trunc of the given expression.
Definition: operation.hpp:3806
auto max_pooling_2d(unsigned long stride) noexcept
Definition: operation.hpp:1197
constexpr auto identity(Ex const &ex) noexcept
Definition: operation.hpp:804
requires std::floating_point< T > auto drop_out(T const factor) noexcept
Definition: operation.hpp:1044
constexpr auto concatenate(Lhs_Expression const &lhs_ex, Rhs_Expression const &rhs_ex) noexcept
Definition: operation.hpp:1517
constexpr auto tanh(Ex const &ex) noexcept
Computes Tanh of the given expression.
Definition: operation.hpp:3731
constexpr auto flatten(Ex const &ex) noexcept
Definition: operation.hpp:782
auto zero_padding_2d(std::vector< unsigned long > const &padding) noexcept
Zero-padding layer for 2D input. The input should have 4-dimensions: (batch_size, row,...
Definition: operation.hpp:1936
constexpr auto erf(Ex const &ex) noexcept
Computes Erf of the given expression.
Definition: operation.hpp:2851
constexpr auto sign(Ex const &ex) noexcept
Definition: operation.hpp:1829
auto reduce_max(unsigned long axis=-1) noexcept
Reduce maximum elements along an axis.
Definition: operation.hpp:2307
constexpr auto lrint(Ex const &ex) noexcept
Computes Lrint of the given expression.
Definition: operation.hpp:3417
constexpr auto nearbyint(Ex const &ex) noexcept
Computes Nearbyint of the given expression.
Definition: operation.hpp:3484
requires std::floating_point< T > auto normalization_batch(T const momentum=0.98) noexcept
Definition: operation.hpp:1371
constexpr auto lround(Ex const &ex) noexcept
Computes Lround of the given expression.
Definition: operation.hpp:3451
constexpr auto atanh(Ex const &ex) noexcept
Computes Atanh of the given expression.
Definition: operation.hpp:2669
auto zeros_like(Ex const &ex) noexcept
Definition: operation.hpp:1764
Definition: operation.hpp:61
Forward_Action forward_action_
Definition: operation.hpp:64
tensor_type rhs_input_data_
Definition: operation.hpp:70
Rhs_Operator rhs_op_
Definition: operation.hpp:63
binary_operator(Lhs_Operator const &lhs_op, Rhs_Operator const &rhs_op, Forward_Action const &forward_action, Backward_Action const &backward_action) noexcept
Definition: operation.hpp:73
tensor_type output_data_
Definition: operation.hpp:71
Backward_Action backward_action_
Definition: operation.hpp:65
Lhs_Operator lhs_op_
Definition: operation.hpp:62
void backward(tensor_type const &grad)
Definition: operation.hpp:99
tensor_deduction< Lhs_Operator, Rhs_Operator >::tensor_type tensor_type
Definition: operation.hpp:67
tensor_type lhs_input_data_
Definition: operation.hpp:69
auto forward()
Definition: operation.hpp:76
Definition: operation.hpp:139
Definition: operation.hpp:119
Definition: value.hpp:53
Definition: tensor.hpp:32
Definition: operation.hpp:21
Forward_Action forward_action_
Definition: operation.hpp:23
decltype(std::declval< Forward_Action >()(std::declval< decltype(op_)>().forward())) typedef tensor_type
Definition: operation.hpp:26
Operator op_
Definition: operation.hpp:22
Backward_Action backward_action_
Definition: operation.hpp:24
void backward(tensor_type const &grad)
Definition: operation.hpp:41
tensor_type output_data_
Definition: operation.hpp:29
auto forward()
Definition: operation.hpp:34
tensor_type input_data_
Definition: operation.hpp:28