// fill_n.hpp
  1. // Copyright 2019 Hans Dembinski
  2. //
  3. // Distributed under the Boost Software License, Version 1.0.
  4. // (See accompanying file LICENSE_1_0.txt
  5. // or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_HISTOGRAM_DETAIL_FILL_N_HPP
  7. #define BOOST_HISTOGRAM_DETAIL_FILL_N_HPP
  8. #include <algorithm>
  9. #include <boost/core/make_span.hpp>
  10. #include <boost/core/span.hpp>
  11. #include <boost/histogram/axis/option.hpp>
  12. #include <boost/histogram/axis/traits.hpp>
  13. #include <boost/histogram/detail/axes.hpp>
  14. #include <boost/histogram/detail/detect.hpp>
  15. #include <boost/histogram/detail/fill.hpp>
  16. #include <boost/histogram/detail/linearize.hpp>
  17. #include <boost/histogram/detail/nonmember_container_access.hpp>
  18. #include <boost/histogram/detail/optional_index.hpp>
  19. #include <boost/histogram/detail/static_if.hpp>
  20. #include <boost/histogram/fwd.hpp>
  21. #include <boost/mp11/algorithm.hpp>
  22. #include <boost/mp11/bind.hpp>
  23. #include <boost/mp11/utility.hpp>
  24. #include <boost/throw_exception.hpp>
  25. #include <boost/variant2/variant.hpp>
  26. #include <cassert>
  27. #include <initializer_list>
  28. #include <stdexcept>
  29. #include <type_traits>
  30. #include <utility>
  31. namespace boost {
  32. namespace histogram {
  33. namespace detail {
  34. namespace dtl = boost::histogram::detail;
// Metafunction: true if T is convertible to the value type of at least one
// axis in Axes; used below to detect the 1d "span of values" special case.
template <class Axes, class T>
using is_convertible_to_any_value_type =
    mp11::mp_any_of_q<value_types<Axes>, mp11::mp_bind_front<std::is_convertible, T>>;
  38. template <class T>
  39. auto to_ptr_size(const T& x) {
  40. return static_if<std::is_scalar<T>>(
  41. [](const auto& x) { return std::make_pair(&x, static_cast<std::size_t>(0)); },
  42. [](const auto& x) { return std::make_pair(dtl::data(x), dtl::size(x)); }, x);
  43. }
  44. template <class F, class V>
  45. decltype(auto) maybe_visit(F&& f, V&& v) {
  46. return static_if<is_variant<std::decay_t<V>>>(
  47. [](auto&& f, auto&& v) {
  48. return variant2::visit(std::forward<F>(f), std::forward<V>(v));
  49. },
  50. [](auto&& f, auto&& v) { return std::forward<F>(f)(std::forward<V>(v)); },
  51. std::forward<F>(f), std::forward<V>(v));
  52. }
// Visitor that linearizes values along one axis into the flat index buffer.
// fill_n_indices applies one instance per axis; each application multiplies
// its contribution by the running stride of that axis.
template <class Index, class Axis, class IsGrowing>
struct index_visitor {
  using index_type = Index;
  using pointer = index_type*;
  using value_type = axis::traits::value_type<Axis>;
  using Opt = axis::traits::get_options<Axis>;

  Axis& axis_;
  const std::size_t stride_, start_, size_; // start and size of value collection
  const pointer begin_;          // first element of the index buffer
  axis::index_type* shift_;      // accumulated zero-point shift for this axis

  index_visitor(Axis& a, std::size_t& str, const std::size_t& sta, const std::size_t& si,
                const pointer it, axis::index_type* shift)
      : axis_(a), stride_(str), start_(sta), size_(si), begin_(it), shift_(shift) {}

  // Linearize one value when at least one axis may grow.
  template <class T>
  void call_2(std::true_type, pointer it, const T& x) const {
    // must use this code for all axes if one of them is growing
    axis::index_type shift;
    linearize_growth(*it, shift, stride_, axis_,
                     try_cast<value_type, std::invalid_argument>(x));
    if (shift > 0) { // shift previous indices, because axis zero-point has changed
      while (it != begin_) *--it += static_cast<std::size_t>(shift) * stride_;
      *shift_ += shift;
    }
  }

  // Linearize one value on the fast path.
  template <class T>
  void call_2(std::false_type, pointer it, const T& x) const {
    // no axis is growing
    linearize(*it, stride_, axis_, try_cast<value_type, std::invalid_argument>(x));
  }

  // T is iterable: consume one element per index slot.
  template <class T>
  void call_1(std::false_type, const T& iterable) const {
    // T is iterable; fill N values
    const auto* tp = dtl::data(iterable) + start_;
    for (auto it = begin_; it != begin_ + size_; ++it) call_2(IsGrowing{}, it, *tp++);
  }

  // T is a single compatible value: broadcast it to all index slots.
  template <class T>
  void call_1(std::true_type, const T& value) const {
    // T is compatible value; fill single value N times
    // Optimization: We call call_2 only once and then add the index shift onto the
    // whole array of indices, because it is always the same. This also works if the
    // axis grows during this operation. There are no shifts to apply if the zero-point
    // changes.
    const auto before = *begin_;
    call_2(IsGrowing{}, begin_, value);
    if (is_valid(*begin_)) {
      // since index can be std::size_t or optional_index, must do conversion here
      const auto delta =
          static_cast<std::intptr_t>(*begin_) - static_cast<std::intptr_t>(before);
      for (auto it = begin_ + 1; it != begin_ + size_; ++it) *it += delta;
    } else
      std::fill(begin_, begin_ + size_, invalid_index);
  }

  // Dispatch: treat T as a single value if it converts to the axis value type
  // or is not iterable at all; otherwise treat it as a sequence of values.
  template <class T>
  void operator()(const T& iterable_or_value) const {
    call_1(mp11::mp_bool<(std::is_convertible<T, value_type>::value ||
                          !is_iterable<T>::value)>{},
           iterable_or_value);
  }
};
// Compute `size` flat storage indices from per-axis value buffers.
// - indices: output buffer of Index (std::size_t or optional_index)
// - start/size: subrange [start, start+size) of each value buffer to process
// - offset: constant index offset (forced to 0 when any axis can grow)
// - viter: one value buffer entry per axis
// If an axis grew during linearization, the storage is resized accordingly.
template <class Index, class S, class Axes, class T>
void fill_n_indices(Index* indices, const std::size_t start, const std::size_t size,
                    const std::size_t offset, S& storage, Axes& axes, const T* viter) {
  // snapshot current extents and zero the shift accumulators so that growth
  // can be detected and the storage relocated afterwards
  axis::index_type extents[buffer_size<Axes>::value];
  axis::index_type shifts[buffer_size<Axes>::value];
  for_each_axis(axes, [eit = extents, sit = shifts](const auto& a) mutable {
    *sit++ = 0;
    *eit++ = axis::traits::extent(a);
  }); // LCOV_EXCL_LINE: gcc-8 is missing this line for no reason

  // TODO this seems to always take the path for growing axes, even if Axes is vector
  // of variant and types actually held are not growing axes?

  // index offset must be zero for growing axes
  using IsGrowing = has_growing_axis<Axes>;
  std::fill(indices, indices + size, IsGrowing::value ? 0 : offset);
  // accumulate each axis' contribution into the index buffer; stride is the
  // product of the extents of all axes processed so far
  for_each_axis(axes, [&, stride = static_cast<std::size_t>(1),
                       pshift = shifts](auto& axis) mutable {
    using Axis = std::decay_t<decltype(axis)>;
    maybe_visit(
        index_visitor<Index, Axis, IsGrowing>{axis, stride, start, size, indices, pshift},
        *viter++);
    stride *= static_cast<std::size_t>(axis::traits::extent(axis));
    ++pshift;
  });

  // if any extent changed, grow the storage and shift existing cells
  bool update_needed = false;
  for_each_axis(axes, [&update_needed, eit = extents](const auto& a) mutable {
    update_needed |= *eit++ != axis::traits::extent(a);
  });
  if (update_needed) {
    storage_grower<Axes> g(axes);
    g.from_extents(extents);
    g.apply(storage, shifts);
  }
}
// Fill one storage cell at idx (when valid) from the current position of each
// (pointer, length) pack, then advance every pointer whose length is non-zero
// (length 0 marks a broadcast scalar that must not advance).
template <class S, class Index, class... Ts>
void fill_n_storage(S& s, const Index idx, Ts&&... p) noexcept {
  if (is_valid(idx)) {
    assert(idx < s.size());
    fill_storage_element(s[idx], *p.first...);
  }
  // operator folding emulation
  (void)std::initializer_list<int>{(p.second ? (++p.first, 0) : 0)...};
}
// Overload for a leading weight argument: fill the cell with the current
// weight plus samples, then advance the weight pointer and the sample
// pointers (again, length 0 means broadcast and does not advance).
template <class S, class Index, class T, class... Ts>
void fill_n_storage(S& s, const Index idx, weight_type<T>&& w, Ts&&... ps) noexcept {
  if (is_valid(idx)) {
    assert(idx < s.size());
    fill_storage_element(s[idx], weight(*w.value.first), *ps.first...);
  }
  if (w.value.second) ++w.value.first;
  // operator folding emulation
  (void)std::initializer_list<int>{(ps.second ? (++ps.first, 0) : 0)...};
}
// general Nd treatment
// Process the vsize input values in fixed-size chunks: convert each chunk to
// flat indices (fill_n_indices) and then increment the corresponding storage
// cells (fill_n_storage). Index is std::size_t or optional_index.
template <class Index, class S, class A, class T, class... Ts>
void fill_n_nd(const std::size_t offset, S& storage, A& axes, const std::size_t vsize,
               const T* values, Ts&&... ts) {
  // chunk size bounds the stack usage of the index buffer
  constexpr std::size_t buffer_size = 1ul << 14;
  Index indices[buffer_size];

  /*
    Parallelization options.

    A) Run the whole fill2 method in parallel, each thread fills its own buffer of
    indices, synchronization (atomics) are needed to synchronize the incrementing of
    the storage cells. This leads to a lot of congestion for small histograms.

    B) Run only fill_n_indices in parallel, subsections of the indices buffer
    can be filled by different threads. The final loop that fills the storage runs
    in the main thread, this requires no synchronization for the storage, cells do
    not need to support atomic operations.

    C) Like B), then sort the indices in the main thread and fill the
    storage in parallel, where each thread uses a disjunct set of indices. This
    should create less congestion and requires no synchronization for the storage.

    Note on C): Let's say we have an axis with 5 bins (with *flow to simplify).
    Then after filling 10 values, converting to indices and sorting, the index
    buffer may look like this: 0 0 0 1 2 2 2 4 4 5. Let's use two threads to fill
    the storage. Still in the main thread, we compute an iterator to the middle of
    the index buffer and move it to the right until the pointee changes. Now we have
    two ranges which contain disjunct sets of indices. We pass these ranges to the
    threads which then fill the storage. Since the threads by construction do not
    compete to increment the same cell, no further synchronization is required.

    In all cases, growing axes cannot be parallelized.
  */

  for (std::size_t start = 0; start < vsize; start += buffer_size) {
    const std::size_t n = (std::min)(buffer_size, vsize - start);
    // fill buffer of indices...
    fill_n_indices(indices, start, n, offset, storage, axes, values);
    // ...and fill corresponding storage cells
    for (auto&& idx : make_span(indices, n))
      fill_n_storage(storage, idx, std::forward<Ts>(ts)...);
  }
}
// Static axes overload: the index type can be chosen at compile time.
// optional_index is only needed when some axis is not inclusive, i.e. may
// reject a value; otherwise the cheaper std::size_t suffices.
template <class S, class... As, class T, class... Us>
void fill_n_1(const std::size_t offset, S& storage, std::tuple<As...>& axes,
              const std::size_t vsize, const T* values, Us&&... us) {
  using index_type =
      mp11::mp_if<has_non_inclusive_axis<std::tuple<As...>>, optional_index, std::size_t>;
  fill_n_nd<index_type>(offset, storage, axes, vsize, values, std::forward<Us>(us)...);
}
// Dynamic axes overload: inclusiveness is only known at run time, so decide
// at run time between the std::size_t and optional_index instantiations.
template <class S, class A, class T, class... Us>
void fill_n_1(const std::size_t offset, S& storage, A& axes, const std::size_t vsize,
              const T* values, Us&&... us) {
  bool all_inclusive = true;
  for_each_axis(axes,
                [&](const auto& ax) { all_inclusive &= axis::traits::inclusive(ax); });
  if (axes_rank(axes) == 1) {
    // Optimization: benchmark shows that this makes filling dynamic 1D histogram faster
    axis::visit(
        [&](auto& ax) {
          // wrap the concrete axis type in a static tuple (intentionally
          // shadows the outer dynamic `axes`) and recurse into the
          // compile-time overload above
          std::tuple<decltype(ax)> axes{ax};
          fill_n_1(offset, storage, axes, vsize, values, std::forward<Us>(us)...);
        },
        axes[0]);
  } else {
    if (all_inclusive)
      fill_n_nd<std::size_t>(offset, storage, axes, vsize, values,
                             std::forward<Us>(us)...);
    else
      fill_n_nd<optional_index>(offset, storage, axes, vsize, values,
                                std::forward<Us>(us)...);
  }
}
// Determine the common length of all iterable arguments in `values`.
// Throws std::invalid_argument if two iterables have different lengths;
// returns 1 when no argument is iterable (all are broadcast scalars).
template <class A, class T, std::size_t N>
std::size_t get_total_size(const A& axes, const span<const T, N>& values) {
  // supported cases (T = value type; CT = container of T; V<T, CT, ...> = variant):
  // - span<CT, N>: for any histogram, N == rank
  // - span<V<T, CT>, N>: for any histogram, N == rank
  assert(axes_rank(axes) == values.size());
  constexpr auto unset = static_cast<std::size_t>(-1);
  std::size_t size = unset;
  for_each_axis(axes, [&size, vit = values.begin()](const auto& ax) mutable {
    using AV = axis::traits::value_type<std::decay_t<decltype(ax)>>;
    maybe_visit(
        [&size](const auto& v) {
          // v is either convertible to value or a sequence of values
          using V = std::remove_const_t<std::remove_reference_t<decltype(v)>>;
          static_if_c<(std::is_convertible<decltype(v), AV>::value ||
                       !is_iterable<V>::value)>(
              [](const auto&) {}, // scalar: contributes no length constraint
              [&size](const auto& v) {
                const auto n = dtl::size(v);
                // must repeat this here for msvc :(
                constexpr auto unset = static_cast<std::size_t>(-1);
                if (size == unset)
                  size = dtl::size(v);
                else if (size != n)
                  BOOST_THROW_EXCEPTION(
                      std::invalid_argument("spans must have compatible lengths"));
              },
              v);
        },
        *vit++);
  });
  // if all arguments are not iterables, return size of 1
  return size == unset ? 1 : size;
}
  265. inline void fill_n_check_extra_args(std::size_t) noexcept {}
  266. template <class T, class... Ts>
  267. void fill_n_check_extra_args(std::size_t size, T&& x, Ts&&... ts) {
  268. // sequences must have same lengths, but sequences of length 0 are broadcast
  269. if (x.second != 0 && x.second != size)
  270. BOOST_THROW_EXCEPTION(std::invalid_argument("spans must have compatible lengths"));
  271. fill_n_check_extra_args(size, std::forward<Ts>(ts)...);
  272. }
  273. template <class T, class... Ts>
  274. void fill_n_check_extra_args(std::size_t size, weight_type<T>&& w, Ts&&... ts) {
  275. fill_n_check_extra_args(size, w.value, std::forward<Ts>(ts)...);
  276. }
// Entry point for fill_n with argument types compatible with the histogram.
// Dispatches between the 1d "span of plain values" special case and the
// generic N-dimensional case; validates ranks and extra-argument lengths.
template <class S, class A, class T, std::size_t N, class... Us>
void fill_n(std::true_type, const std::size_t offset, S& storage, A& axes,
            const span<const T, N> values, Us&&... us) {
  // supported cases (T = value type; CT = container of T; V<T, CT, ...> = variant):
  // - span<T, N>: only valid for 1D histogram, N > 1 allowed
  // - span<CT, N>: for any histogram, N == rank
  // - span<V<T, CT>, N>: for any histogram, N == rank
  static_if<is_convertible_to_any_value_type<A, T>>(
      [&](const auto& values, auto&&... us) {
        // T matches one of the axis value types, must be 1D special case
        if (axes_rank(axes) != 1)
          BOOST_THROW_EXCEPTION(
              std::invalid_argument("number of arguments must match histogram rank"));
        fill_n_check_extra_args(values.size(), std::forward<Us>(us)...);
        // pass address of the span itself: one "argument" holding all values
        fill_n_1(offset, storage, axes, values.size(), &values, std::forward<Us>(us)...);
      },
      [&](const auto& values, auto&&... us) {
        // generic ND case
        if (axes_rank(axes) != values.size())
          BOOST_THROW_EXCEPTION(
              std::invalid_argument("number of arguments must match histogram rank"));
        const auto vsize = get_total_size(axes, values);
        fill_n_check_extra_args(vsize, std::forward<Us>(us)...);
        fill_n_1(offset, storage, axes, vsize, values.data(), std::forward<Us>(us)...);
      },
      values, std::forward<Us>(us)...);
}
  304. // empty implementation for bad arguments to stop compiler from showing internals
  305. template <class... Ts>
  306. void fill_n(std::false_type, Ts...) {}
  307. } // namespace detail
  308. } // namespace histogram
  309. } // namespace boost
  310. #endif // BOOST_HISTOGRAM_DETAIL_FILL_N_HPP