cma_es.hpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. /*
  2. * Copyright Nick Thompson, 2024
  3. * Use, modification and distribution are subject to the
  4. * Boost Software License, Version 1.0. (See accompanying file
  5. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. */
  7. #ifndef BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
  8. #define BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
  9. #include <atomic>
  10. #include <cmath>
  11. #include <iostream>
  12. #include <limits>
  13. #include <random>
  14. #include <sstream>
  15. #include <stdexcept>
  16. #include <utility>
  17. #include <vector>
  18. #include <boost/math/optimization/detail/common.hpp>
  19. #include <boost/math/tools/assert.hpp>
  20. #if __has_include(<Eigen/Dense>)
  21. #include <Eigen/Dense>
  22. #else
  23. #error "CMA-ES requires Eigen."
  24. #endif
  25. // Follows the notation in:
  26. // https://arxiv.org/pdf/1604.00772.pdf
  27. // This is a (hopefully) faithful reproduction of the pseudocode in the arxiv review
  28. // by Nikolaus Hansen.
  29. // Comments referring to equations all refer to this arxiv review.
  30. // A slide deck by the same author is given here:
  31. // http://www.cmap.polytechnique.fr/~nikolaus.hansen/CmaTutorialGecco2023-no-audio.pdf
  32. // which is also a very useful reference.
  33. #ifndef BOOST_MATH_DEBUG_CMA_ES
  34. #define BOOST_MATH_DEBUG_CMA_ES 0
  35. #endif
  36. namespace boost::math::optimization {
  37. template <typename ArgumentContainer> struct cma_es_parameters {
  38. using Real = typename ArgumentContainer::value_type;
  39. using DimensionlessReal = decltype(Real()/Real());
  40. ArgumentContainer lower_bounds;
  41. ArgumentContainer upper_bounds;
  42. size_t max_generations = 1000;
  43. ArgumentContainer const *initial_guess = nullptr;
  44. // In the reference, population size = \lambda.
  45. // If the population size is zero, it is set to equation (48) of the reference
  46. // and rounded up to the nearest multiple of threads:
  47. size_t population_size = 0;
  48. // In the reference, learning_rate = c_m:
  49. DimensionlessReal learning_rate = 1;
  50. };
  51. template <typename ArgumentContainer>
  52. void validate_cma_es_parameters(cma_es_parameters<ArgumentContainer> &params) {
  53. using Real = typename ArgumentContainer::value_type;
  54. using DimensionlessReal = decltype(Real()/Real());
  55. using std::isfinite;
  56. using std::isnan;
  57. using std::log;
  58. using std::ceil;
  59. using std::floor;
  60. std::ostringstream oss;
  61. detail::validate_bounds(params.lower_bounds, params.upper_bounds);
  62. if (params.initial_guess) {
  63. detail::validate_initial_guess(*params.initial_guess, params.lower_bounds, params.upper_bounds);
  64. }
  65. const size_t n = params.upper_bounds.size();
  66. // Equation 48 of the arxiv review:
  67. if (params.population_size == 0) {
  68. //auto tmp = 4.0 + floor(3*log(n));
  69. // But round to the nearest multiple of the thread count:
  70. //auto k = static_cast<size_t>(std::ceil(tmp/params.threads));
  71. //params.population_size = k*params.threads;
  72. params.population_size = static_cast<size_t>(4 + floor(3*log(n)));
  73. }
  74. if (params.learning_rate <= DimensionlessReal(0) || !isfinite(params.learning_rate)) {
  75. oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
  76. oss << ": The learning rate must be > 0, but got " << params.learning_rate << ".";
  77. throw std::invalid_argument(oss.str());
  78. }
  79. }
  80. template <typename ArgumentContainer, class Func, class URBG>
  81. ArgumentContainer cma_es(
  82. const Func cost_function,
  83. cma_es_parameters<ArgumentContainer> &params,
  84. URBG &gen,
  85. std::invoke_result_t<Func, ArgumentContainer> target_value = std::numeric_limits<std::invoke_result_t<Func, ArgumentContainer>>::quiet_NaN(),
  86. std::atomic<bool> *cancellation = nullptr,
  87. std::atomic<std::invoke_result_t<Func, ArgumentContainer>> *current_minimum_cost = nullptr,
  88. std::vector<std::pair<ArgumentContainer, std::invoke_result_t<Func, ArgumentContainer>>> *queries = nullptr)
  89. {
  90. using Real = typename ArgumentContainer::value_type;
  91. using DimensionlessReal = decltype(Real()/Real());
  92. using ResultType = std::invoke_result_t<Func, ArgumentContainer>;
  93. using std::abs;
  94. using std::log;
  95. using std::exp;
  96. using std::pow;
  97. using std::min;
  98. using std::max;
  99. using std::sqrt;
  100. using std::isnan;
  101. using std::isfinite;
  102. using std::uniform_real_distribution;
  103. using std::normal_distribution;
  104. validate_cma_es_parameters(params);
  105. // n = dimension of problem:
  106. const size_t n = params.lower_bounds.size();
  107. std::atomic<bool> target_attained = false;
  108. std::atomic<ResultType> lowest_cost = std::numeric_limits<ResultType>::infinity();
  109. ArgumentContainer best_vector;
  110. // p_{c} := evolution path, equation (24) of the arxiv review:
  111. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_c(n);
  112. // p_{\sigma} := conjugate evolution path, equation (31) of the arxiv review:
  113. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_sigma(n);
  114. if constexpr (detail::has_resize_v<ArgumentContainer>) {
  115. best_vector.resize(n, std::numeric_limits<Real>::quiet_NaN());
  116. }
  117. for (size_t i = 0; i < n; ++i) {
  118. p_c[i] = DimensionlessReal(0);
  119. p_sigma[i] = DimensionlessReal(0);
  120. }
  121. // Table 1, \mu = floor(\lambda/2):
  122. size_t mu = params.population_size/2;
  123. std::vector<DimensionlessReal> w_prime(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
  124. for (size_t i = 0; i < params.population_size; ++i) {
  125. // Equation (49), but 0-indexed:
  126. w_prime[i] = log(static_cast<DimensionlessReal>(params.population_size + 1)/(2*(i+1)));
  127. }
  128. // Table 1, notes at top:
  129. DimensionlessReal positive_weight_sum = 0;
  130. DimensionlessReal sq_weight_sum = 0;
  131. for (size_t i = 0; i < mu; ++i) {
  132. BOOST_MATH_ASSERT(w_prime[i] > 0);
  133. positive_weight_sum += w_prime[i];
  134. sq_weight_sum += w_prime[i]*w_prime[i];
  135. }
  136. DimensionlessReal mu_eff = positive_weight_sum*positive_weight_sum/sq_weight_sum;
  137. BOOST_MATH_ASSERT(1 <= mu_eff);
  138. BOOST_MATH_ASSERT(mu_eff <= mu);
  139. DimensionlessReal negative_weight_sum = 0;
  140. sq_weight_sum = 0;
  141. for (size_t i = mu; i < params.population_size; ++i) {
  142. BOOST_MATH_ASSERT(w_prime[i] <= 0);
  143. negative_weight_sum += w_prime[i];
  144. sq_weight_sum += w_prime[i]*w_prime[i];
  145. }
  146. DimensionlessReal mu_eff_m = negative_weight_sum*negative_weight_sum/sq_weight_sum;
  147. // Equation (54):
  148. DimensionlessReal c_m = params.learning_rate;
  149. // Equation (55):
  150. DimensionlessReal c_sigma = (mu_eff + 2)/(n + mu_eff + 5);
  151. BOOST_MATH_ASSERT(c_sigma < 1);
  152. DimensionlessReal d_sigma = 1 + 2*(max)(DimensionlessReal(0), sqrt(DimensionlessReal((mu_eff - 1)/(n + 1))) - DimensionlessReal(1)) + c_sigma;
  153. // Equation (56):
  154. DimensionlessReal c_c = (4 + mu_eff/n)/(n + 4 + 2*mu_eff/n);
  155. BOOST_MATH_ASSERT(c_c <= 1);
  156. // Equation (57):
  157. DimensionlessReal c_1 = DimensionlessReal(2)/(pow(n + 1.3, 2) + mu_eff);
  158. // Equation (58)
  159. DimensionlessReal c_mu = (min)(1 - c_1, 2*(DimensionlessReal(0.25) + mu_eff + 1/mu_eff - 2)/((n+2)*(n+2) + mu_eff));
  160. BOOST_MATH_ASSERT(c_1 + c_mu <= DimensionlessReal(1));
  161. // Equation (50):
  162. DimensionlessReal alpha_mu_m = 1 + c_1/c_mu;
  163. // Equation (51):
  164. DimensionlessReal alpha_mu_eff_m = 1 + 2*mu_eff_m/(mu_eff + 2);
  165. // Equation (52):
  166. DimensionlessReal alpha_m_pos_def = (1- c_1 - c_mu)/(n*c_mu);
  167. // Equation (53):
  168. std::vector<DimensionlessReal> weights(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
  169. for (size_t i = 0; i < mu; ++i) {
  170. weights[i] = w_prime[i]/positive_weight_sum;
  171. }
  172. DimensionlessReal min_alpha = (min)(alpha_mu_m, (min)(alpha_mu_eff_m, alpha_m_pos_def));
  173. for (size_t i = mu; i < params.population_size; ++i) {
  174. weights[i] = min_alpha*w_prime[i]/abs(negative_weight_sum);
  175. }
  176. // mu:= number of parents, lambda := number of offspring.
  177. Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> C = Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>::Identity(n, n);
  178. ArgumentContainer mean_vector;
  179. // See the footnote in Figure 6 of the arxiv review:
  180. // We should consider the more robust initialization described there. . .
  181. Real sigma = DimensionlessReal(0.3)*(params.upper_bounds[0] - params.lower_bounds[0]);;
  182. if (params.initial_guess) {
  183. mean_vector = *params.initial_guess;
  184. }
  185. else {
  186. mean_vector = detail::random_initial_population(params.lower_bounds, params.upper_bounds, 1, gen)[0];
  187. }
  188. auto initial_cost = cost_function(mean_vector);
  189. if (!isnan(initial_cost)) {
  190. best_vector = mean_vector;
  191. lowest_cost = initial_cost;
  192. if (current_minimum_cost) {
  193. *current_minimum_cost = initial_cost;
  194. }
  195. }
  196. #if BOOST_MATH_DEBUG_CMA_ES
  197. {
  198. std::cout << __FILE__ << ":" << __LINE__ << ":" << __func__ << "\n";
  199. std::cout << "\tRunning a (" << params.population_size/2 << "/" << params.population_size/2 << "_W, " << params.population_size << ")-aCMA Evolutionary Strategy on " << params.threads << " threads.\n";
  200. std::cout << "\tInitial mean vector: {";
  201. for (size_t i = 0; i < n - 1; ++i) {
  202. std::cout << mean_vector[i] << ", ";
  203. }
  204. std::cout << mean_vector[n - 1] << "}.\n";
  205. std::cout << "\tCost: " << lowest_cost << ".\n";
  206. std::cout << "\tInitial step length: " << sigma << ".\n";
  207. std::cout << "\tVariance effective selection mass: " << mu_eff << ".\n";
  208. std::cout << "\tLearning rate for rank-one update of covariance matrix: " << c_1 << ".\n";
  209. std::cout << "\tLearning rate for rank-mu update of covariance matrix: " << c_mu << ".\n";
  210. std::cout << "\tDecay rate for cumulation path for step-size control: " << c_sigma << ".\n";
  211. std::cout << "\tLearning rate for the mean: " << c_m << ".\n";
  212. std::cout << "\tDamping parameter for step-size update: " << d_sigma << ".\n";
  213. }
  214. #endif
  215. size_t generation = 0;
  216. std::vector<Eigen::Vector<DimensionlessReal, Eigen::Dynamic>> ys(params.population_size);
  217. std::vector<ArgumentContainer> xs(params.population_size);
  218. std::vector<ResultType> costs(params.population_size, std::numeric_limits<ResultType>::quiet_NaN());
  219. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> weighted_avg_y(n);
  220. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> z(n);
  221. if constexpr (detail::has_resize_v<ArgumentContainer>) {
  222. for (auto & x : xs) {
  223. x.resize(n, std::numeric_limits<Real>::quiet_NaN());
  224. }
  225. }
  226. for (auto & y : ys) {
  227. y.resize(n);
  228. }
  229. normal_distribution<DimensionlessReal> dis(DimensionlessReal(0), DimensionlessReal(1));
  230. do {
  231. if (cancellation && *cancellation) {
  232. break;
  233. }
  234. // TODO: The reference contends the following in
  235. // Section B.2 "Strategy internal numerical effort":
  236. // "In practice, the re-calculation of B and D needs to be done not until about
  237. // max(1, floor(1/(10n(c_1+c_mu)))) generations."
  238. // Note that sigma can be dimensionless, in which case C carries the units.
  239. // This is a weird decision-we're not gonna do that!
  240. Eigen::SelfAdjointEigenSolver<Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>> eigensolver(C);
  241. if (eigensolver.info() != Eigen::Success) {
  242. std::ostringstream oss;
  243. oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
  244. oss << ": Could not decompose the covariance matrix as BDB^{T}.";
  245. throw std::logic_error(oss.str());
  246. }
  247. Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> B = eigensolver.eigenvectors();
  248. // Eigen returns D^2, in the notation of the survey:
  249. auto D = eigensolver.eigenvalues();
  250. // So make it better:
  251. for (auto & d : D) {
  252. if (d <= 0 || isnan(d)) {
  253. std::ostringstream oss;
  254. oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
  255. oss << ": The covariance matrix is not positive definite. This breaks the evolution path computation downstream.\n";
  256. oss << "C=\n" << C << "\n";
  257. oss << "Eigenvalues: " << D;
  258. throw std::domain_error(oss.str());
  259. }
  260. d = sqrt(d);
  261. }
  262. for (size_t k = 0; k < params.population_size; ++k) {
  263. auto & y = ys[k];
  264. auto & x = xs[k];
  265. BOOST_MATH_ASSERT(static_cast<size_t>(x.size()) == n);
  266. BOOST_MATH_ASSERT(static_cast<size_t>(y.size()) == n);
  267. size_t resample_counter = 0;
  268. do {
  269. // equation (39) of Figure 6:
  270. // Also see equation (4):
  271. for (size_t i = 0; i < n; ++i) {
  272. z[i] = dis(gen);
  273. }
  274. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> Dz(n);
  275. for (size_t i = 0; i < n; ++i) {
  276. Dz[i] = D[i]*z[i];
  277. }
  278. y = B*Dz;
  279. for (size_t i = 0; i < n; ++i) {
  280. BOOST_MATH_ASSERT(!isnan(mean_vector[i]));
  281. BOOST_MATH_ASSERT(!isnan(y[i]));
  282. x[i] = mean_vector[i] + sigma*y[i]; // equation (40) of Figure 6.
  283. }
  284. costs[k] = cost_function(x);
  285. if (resample_counter++ == 50) {
  286. std::ostringstream oss;
  287. oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
  288. oss << ": 50 resamples was not sufficient to find an argument to the cost function which did not return NaN.";
  289. oss << " Giving up.";
  290. throw std::domain_error(oss.str());
  291. }
  292. } while (isnan(costs[k]));
  293. if (queries) {
  294. queries->emplace_back(std::make_pair(x, costs[k]));
  295. }
  296. if (costs[k] < lowest_cost) {
  297. lowest_cost = costs[k];
  298. best_vector = x;
  299. if (current_minimum_cost && costs[k] < *current_minimum_cost) {
  300. *current_minimum_cost = costs[k];
  301. }
  302. if (lowest_cost < target_value) {
  303. target_attained = true;
  304. break;
  305. }
  306. }
  307. }
  308. if (target_attained) {
  309. break;
  310. }
  311. if (cancellation && *cancellation) {
  312. break;
  313. }
  314. auto indices = detail::best_indices(costs);
  315. // Equation (41), Figure 6:
  316. for (size_t j = 0; j < n; ++j) {
  317. weighted_avg_y[j] = 0;
  318. for (size_t i = 0; i < mu; ++i) {
  319. BOOST_MATH_ASSERT(!isnan(weights[i]));
  320. BOOST_MATH_ASSERT(!isnan(ys[indices[i]][j]));
  321. weighted_avg_y[j] += weights[i]*ys[indices[i]][j];
  322. }
  323. }
  324. // Equation (42), Figure 6:
  325. for (size_t j = 0; j < n; ++j) {
  326. mean_vector[j] = mean_vector[j] + c_m*sigma*weighted_avg_y[j];
  327. }
  328. // Equation (43), Figure 6: Start with C^{-1/2}<y>_{w}
  329. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> inv_D_B_transpose_y = B.transpose()*weighted_avg_y;
  330. for (long j = 0; j < inv_D_B_transpose_y.size(); ++j) {
  331. inv_D_B_transpose_y[j] /= D[j];
  332. }
  333. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> C_inv_sqrt_y_avg = B*inv_D_B_transpose_y;
  334. // Equation (43), Figure 6:
  335. DimensionlessReal p_sigma_norm = 0;
  336. for (size_t j = 0; j < n; ++j) {
  337. p_sigma[j] = (1-c_sigma)*p_sigma[j] + sqrt(c_sigma*(2-c_sigma)*mu_eff)*C_inv_sqrt_y_avg[j];
  338. p_sigma_norm += p_sigma[j]*p_sigma[j];
  339. }
  340. p_sigma_norm = sqrt(p_sigma_norm);
  341. // A: Algorithm Summary: E[||N(0,1)||]:
  342. const DimensionlessReal expectation_norm_0I = sqrt(static_cast<DimensionlessReal>(n))*(DimensionlessReal(1) - DimensionlessReal(1)/(4*n) + DimensionlessReal(1)/(21*n*n));
  343. // Equation (44), Figure 6:
  344. sigma = sigma*exp(c_sigma*(p_sigma_norm/expectation_norm_0I -1)/d_sigma);
  345. // A: Algorithm Summary:
  346. DimensionlessReal h_sigma = 0;
  347. DimensionlessReal rhs = (DimensionlessReal(1.4) + DimensionlessReal(2)/(n+1))*expectation_norm_0I*sqrt(1 - pow(1-c_sigma, 2*(generation+1)));
  348. if (p_sigma_norm < rhs) {
  349. h_sigma = 1;
  350. }
  351. // Equation (45), Figure 6:
  352. p_c = (1-c_c)*p_c + h_sigma*sqrt(c_c*(2-c_c)*mu_eff)*weighted_avg_y;
  353. DimensionlessReal delta_h_sigma = (1-h_sigma)*c_c*(2-c_c);
  354. DimensionlessReal weight_sum = 0;
  355. for (auto & w : weights) {
  356. weight_sum += w;
  357. }
  358. // Equation (47), Figure 6:
  359. DimensionlessReal K = (1 + c_1*delta_h_sigma - c_1 - c_mu*weight_sum);
  360. // Can these operations be sped up using `.selfadjointView<Eigen::Upper>`?
  361. // Maybe: A.selfadjointView<Eigen::Lower>().rankUpdate(p_c, c_1);?
  362. C = K*C + c_1*p_c*p_c.transpose();
  363. // Incorporate positive weights of Equation (46):
  364. for (size_t i = 0; i < params.population_size/2; ++i) {
  365. C += c_mu*weights[i]*ys[indices[i]]*ys[indices[i]].transpose();
  366. }
  367. for (size_t i = params.population_size/2; i < params.population_size; ++i) {
  368. Eigen::Vector<DimensionlessReal, Eigen::Dynamic> D_inv_BTy = B.transpose()*ys[indices[i]];
  369. for (size_t j = 0; j < n; ++j) {
  370. D_inv_BTy[j] /= D[j];
  371. }
  372. DimensionlessReal squared_norm = D_inv_BTy.squaredNorm();
  373. DimensionlessReal K2 = c_mu*weights[i]/squared_norm;
  374. C += K2*ys[indices[i]]*ys[indices[i]].transpose();
  375. }
  376. } while (generation++ < params.max_generations);
  377. return best_vector;
  378. }
  379. } // namespace boost::math::optimization
  380. #endif