univariate_statistics.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. // (C) Copyright Nick Thompson 2018.
  2. // Use, modification and distribution are subject to the
  3. // Boost Software License, Version 1.0. (See accompanying file
  4. // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #ifndef BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
  6. #define BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
  7. #include <algorithm>
  8. #include <iterator>
  9. #include <tuple>
  10. #include <boost/math/tools/assert.hpp>
  11. #include <boost/math/tools/header_deprecated.hpp>
  12. #include <boost/math/tools/is_standalone.hpp>
  13. #ifndef BOOST_MATH_STANDALONE
  14. #include <boost/config.hpp>
  15. #ifdef BOOST_MATH_NO_CXX17_IF_CONSTEXPR
  16. #error "The header <boost/math/norms.hpp> can only be used in C++17 and later."
  17. #endif
  18. #endif
  19. BOOST_MATH_HEADER_DEPRECATED("<boost/math/statistics/univariate_statistics.hpp>");
  20. namespace boost::math::tools {
  21. template<class ForwardIterator>
  22. auto mean(ForwardIterator first, ForwardIterator last)
  23. {
  24. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  25. BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute the mean.");
  26. if constexpr (std::is_integral<Real>::value)
  27. {
  28. double mu = 0;
  29. double i = 1;
  30. for(auto it = first; it != last; ++it) {
  31. mu = mu + (*it - mu)/i;
  32. i += 1;
  33. }
  34. return mu;
  35. }
  36. else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>)
  37. {
  38. size_t elements = std::distance(first, last);
  39. Real mu0 = 0;
  40. Real mu1 = 0;
  41. Real mu2 = 0;
  42. Real mu3 = 0;
  43. Real i = 1;
  44. auto end = last - (elements % 4);
  45. for(auto it = first; it != end; it += 4) {
  46. Real inv = Real(1)/i;
  47. Real tmp0 = (*it - mu0);
  48. Real tmp1 = (*(it+1) - mu1);
  49. Real tmp2 = (*(it+2) - mu2);
  50. Real tmp3 = (*(it+3) - mu3);
  51. // please generate a vectorized fma here
  52. mu0 += tmp0*inv;
  53. mu1 += tmp1*inv;
  54. mu2 += tmp2*inv;
  55. mu3 += tmp3*inv;
  56. i += 1;
  57. }
  58. Real num1 = Real(elements - (elements %4))/Real(4);
  59. Real num2 = num1 + Real(elements % 4);
  60. for (auto it = end; it != last; ++it)
  61. {
  62. mu3 += (*it-mu3)/i;
  63. i += 1;
  64. }
  65. return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements);
  66. }
  67. else
  68. {
  69. auto it = first;
  70. Real mu = *it;
  71. Real i = 2;
  72. while(++it != last)
  73. {
  74. mu += (*it - mu)/i;
  75. i += 1;
  76. }
  77. return mu;
  78. }
  79. }
  80. template<class Container>
  81. inline auto mean(Container const & v)
  82. {
  83. return mean(v.cbegin(), v.cend());
  84. }
  85. template<class ForwardIterator>
  86. auto variance(ForwardIterator first, ForwardIterator last)
  87. {
  88. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  89. BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
  90. // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
  91. if constexpr (std::is_integral<Real>::value)
  92. {
  93. double M = *first;
  94. double Q = 0;
  95. double k = 2;
  96. for (auto it = std::next(first); it != last; ++it)
  97. {
  98. double tmp = *it - M;
  99. Q = Q + ((k-1)*tmp*tmp)/k;
  100. M = M + tmp/k;
  101. k += 1;
  102. }
  103. return Q/(k-1);
  104. }
  105. else
  106. {
  107. Real M = *first;
  108. Real Q = 0;
  109. Real k = 2;
  110. for (auto it = std::next(first); it != last; ++it)
  111. {
  112. Real tmp = (*it - M)/k;
  113. Q += k*(k-1)*tmp*tmp;
  114. M += tmp;
  115. k += 1;
  116. }
  117. return Q/(k-1);
  118. }
  119. }
  120. template<class Container>
  121. inline auto variance(Container const & v)
  122. {
  123. return variance(v.cbegin(), v.cend());
  124. }
  125. template<class ForwardIterator>
  126. auto sample_variance(ForwardIterator first, ForwardIterator last)
  127. {
  128. size_t n = std::distance(first, last);
  129. BOOST_MATH_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance.");
  130. return n*variance(first, last)/(n-1);
  131. }
  132. template<class Container>
  133. inline auto sample_variance(Container const & v)
  134. {
  135. return sample_variance(v.cbegin(), v.cend());
  136. }
  137. // Follows equation 1.5 of:
  138. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  139. template<class ForwardIterator>
  140. auto skewness(ForwardIterator first, ForwardIterator last)
  141. {
  142. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  143. BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute skewness.");
  144. if constexpr (std::is_integral<Real>::value)
  145. {
  146. double M1 = *first;
  147. double M2 = 0;
  148. double M3 = 0;
  149. double n = 2;
  150. for (auto it = std::next(first); it != last; ++it)
  151. {
  152. double delta21 = *it - M1;
  153. double tmp = delta21/n;
  154. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  155. M2 = M2 + tmp*(n-1)*delta21;
  156. M1 = M1 + tmp;
  157. n += 1;
  158. }
  159. double var = M2/(n-1);
  160. if (var == 0)
  161. {
  162. // The limit is technically undefined, but the interpretation here is clear:
  163. // A constant dataset has no skewness.
  164. return static_cast<double>(0);
  165. }
  166. double skew = M3/(M2*sqrt(var));
  167. return skew;
  168. }
  169. else
  170. {
  171. Real M1 = *first;
  172. Real M2 = 0;
  173. Real M3 = 0;
  174. Real n = 2;
  175. for (auto it = std::next(first); it != last; ++it)
  176. {
  177. Real delta21 = *it - M1;
  178. Real tmp = delta21/n;
  179. M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  180. M2 += tmp*(n-1)*delta21;
  181. M1 += tmp;
  182. n += 1;
  183. }
  184. Real var = M2/(n-1);
  185. if (var == 0)
  186. {
  187. // The limit is technically undefined, but the interpretation here is clear:
  188. // A constant dataset has no skewness.
  189. return Real(0);
  190. }
  191. Real skew = M3/(M2*sqrt(var));
  192. return skew;
  193. }
  194. }
  195. template<class Container>
  196. inline auto skewness(Container const & v)
  197. {
  198. return skewness(v.cbegin(), v.cend());
  199. }
  200. // Follows equation 1.5/1.6 of:
  201. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  202. template<class ForwardIterator>
  203. auto first_four_moments(ForwardIterator first, ForwardIterator last)
  204. {
  205. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  206. BOOST_MATH_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments.");
  207. if constexpr (std::is_integral<Real>::value)
  208. {
  209. double M1 = *first;
  210. double M2 = 0;
  211. double M3 = 0;
  212. double M4 = 0;
  213. double n = 2;
  214. for (auto it = std::next(first); it != last; ++it)
  215. {
  216. double delta21 = *it - M1;
  217. double tmp = delta21/n;
  218. M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
  219. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  220. M2 = M2 + tmp*(n-1)*delta21;
  221. M1 = M1 + tmp;
  222. n += 1;
  223. }
  224. return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
  225. }
  226. else
  227. {
  228. Real M1 = *first;
  229. Real M2 = 0;
  230. Real M3 = 0;
  231. Real M4 = 0;
  232. Real n = 2;
  233. for (auto it = std::next(first); it != last; ++it)
  234. {
  235. Real delta21 = *it - M1;
  236. Real tmp = delta21/n;
  237. M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
  238. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  239. M2 = M2 + tmp*(n-1)*delta21;
  240. M1 = M1 + tmp;
  241. n += 1;
  242. }
  243. return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
  244. }
  245. }
  246. template<class Container>
  247. inline auto first_four_moments(Container const & v)
  248. {
  249. return first_four_moments(v.cbegin(), v.cend());
  250. }
  251. // Follows equation 1.6 of:
  252. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  253. template<class ForwardIterator>
  254. auto kurtosis(ForwardIterator first, ForwardIterator last)
  255. {
  256. auto [M1, M2, M3, M4] = first_four_moments(first, last);
  257. if (M2 == 0)
  258. {
  259. return M2;
  260. }
  261. return M4/(M2*M2);
  262. }
  263. template<class Container>
  264. inline auto kurtosis(Container const & v)
  265. {
  266. return kurtosis(v.cbegin(), v.cend());
  267. }
  268. template<class ForwardIterator>
  269. auto excess_kurtosis(ForwardIterator first, ForwardIterator last)
  270. {
  271. return kurtosis(first, last) - 3;
  272. }
  273. template<class Container>
  274. inline auto excess_kurtosis(Container const & v)
  275. {
  276. return excess_kurtosis(v.cbegin(), v.cend());
  277. }
  278. template<class RandomAccessIterator>
  279. auto median(RandomAccessIterator first, RandomAccessIterator last)
  280. {
  281. size_t num_elems = std::distance(first, last);
  282. BOOST_MATH_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined.");
  283. if (num_elems & 1)
  284. {
  285. auto middle = first + (num_elems - 1)/2;
  286. std::nth_element(first, middle, last);
  287. return *middle;
  288. }
  289. else
  290. {
  291. auto middle = first + num_elems/2 - 1;
  292. std::nth_element(first, middle, last);
  293. std::nth_element(middle, middle+1, last);
  294. return (*middle + *(middle+1))/2;
  295. }
  296. }
  297. template<class RandomAccessContainer>
  298. inline auto median(RandomAccessContainer & v)
  299. {
  300. return median(v.begin(), v.end());
  301. }
  302. template<class RandomAccessIterator>
  303. auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
  304. {
  305. using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
  306. BOOST_MATH_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples.");
  307. std::sort(first, last);
  308. if constexpr (std::is_integral<Real>::value)
  309. {
  310. double i = 1;
  311. double num = 0;
  312. double denom = 0;
  313. for (auto it = first; it != last; ++it)
  314. {
  315. num += *it*i;
  316. denom += *it;
  317. ++i;
  318. }
  319. // If the l1 norm is zero, all elements are zero, so every element is the same.
  320. if (denom == 0)
  321. {
  322. return static_cast<double>(0);
  323. }
  324. return ((2*num)/denom - i)/(i-1);
  325. }
  326. else
  327. {
  328. Real i = 1;
  329. Real num = 0;
  330. Real denom = 0;
  331. for (auto it = first; it != last; ++it)
  332. {
  333. num += *it*i;
  334. denom += *it;
  335. ++i;
  336. }
  337. // If the l1 norm is zero, all elements are zero, so every element is the same.
  338. if (denom == 0)
  339. {
  340. return Real(0);
  341. }
  342. return ((2*num)/denom - i)/(i-1);
  343. }
  344. }
  345. template<class RandomAccessContainer>
  346. inline auto gini_coefficient(RandomAccessContainer & v)
  347. {
  348. return gini_coefficient(v.begin(), v.end());
  349. }
  350. template<class RandomAccessIterator>
  351. inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
  352. {
  353. size_t n = std::distance(first, last);
  354. return n*gini_coefficient(first, last)/(n-1);
  355. }
  356. template<class RandomAccessContainer>
  357. inline auto sample_gini_coefficient(RandomAccessContainer & v)
  358. {
  359. return sample_gini_coefficient(v.begin(), v.end());
  360. }
  361. template<class RandomAccessIterator>
  362. auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN())
  363. {
  364. using std::abs;
  365. using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
  366. using std::isnan;
  367. if (isnan(center))
  368. {
  369. center = boost::math::tools::median(first, last);
  370. }
  371. size_t num_elems = std::distance(first, last);
  372. BOOST_MATH_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined.");
  373. auto comparator = [&center](Real a, Real b) { return abs(a-center) < abs(b-center);};
  374. if (num_elems & 1)
  375. {
  376. auto middle = first + (num_elems - 1)/2;
  377. std::nth_element(first, middle, last, comparator);
  378. return abs(*middle);
  379. }
  380. else
  381. {
  382. auto middle = first + num_elems/2 - 1;
  383. std::nth_element(first, middle, last, comparator);
  384. std::nth_element(middle, middle+1, last, comparator);
  385. return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2));
  386. }
  387. }
  388. template<class RandomAccessContainer>
  389. inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN())
  390. {
  391. return median_absolute_deviation(v.begin(), v.end(), center);
  392. }
  393. }
  394. #endif