123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
- #define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
- #include <iterator>
- #include <boost/compute/device.hpp>
- #include <boost/compute/kernel.hpp>
- #include <boost/compute/command_queue.hpp>
- #include <boost/compute/algorithm/detail/serial_scan.hpp>
- #include <boost/compute/detail/meta_kernel.hpp>
- #include <boost/compute/detail/iterator_range_size.hpp>
- #include <boost/compute/detail/parameter_cache.hpp>
- namespace boost {
- namespace compute {
- namespace detail {
- template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
- inline OutputIterator scan_on_cpu(InputIterator first,
- InputIterator last,
- OutputIterator result,
- bool exclusive,
- T init,
- BinaryOperator op,
- command_queue &queue)
- {
- typedef typename
- std::iterator_traits<InputIterator>::value_type input_type;
- typedef typename
- std::iterator_traits<OutputIterator>::value_type output_type;
- const context &context = queue.get_context();
- const device &device = queue.get_device();
- const size_t compute_units = queue.get_device().compute_units();
- boost::shared_ptr<parameter_cache> parameters =
- detail::parameter_cache::get_global_cache(device);
- std::string cache_key =
- "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
-
-
- uint_ serial_scan_threshold =
- parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
- serial_scan_threshold =
- (std::max)(serial_scan_threshold, uint_(compute_units));
- size_t count = detail::iterator_range_size(first, last);
- if(count == 0){
- return result;
- }
- else if(count < serial_scan_threshold) {
- return serial_scan(first, last, result, exclusive, init, op, queue);
- }
- buffer block_partial_sums(context, sizeof(output_type) * compute_units );
-
- meta_kernel k("scan_on_cpu_block_scan");
-
- size_t count_arg = k.add_arg<uint_>("count");
- size_t init_arg = k.add_arg<output_type>("initial_value");
- size_t block_partial_sums_arg =
- k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
- k <<
- "uint block = (count + get_global_size(0))/(get_global_size(0) + 1);\n" <<
- "uint index = get_global_id(0) * block;\n" <<
- "uint end = min(count, index + block);\n" <<
- "if(index >= end) return;\n";
- if(!exclusive){
- k <<
- k.decl<output_type>("sum") << " = " <<
- first[k.var<uint_>("index")] << ";\n" <<
- result[k.var<uint_>("index")] << " = sum;\n" <<
- "index++;\n";
- }
- else {
- k <<
- k.decl<output_type>("sum") << ";\n" <<
- "if(index == 0){\n" <<
- "sum = initial_value;\n" <<
- "}\n" <<
- "else {\n" <<
- "sum = " << first[k.var<uint_>("index")] << ";\n" <<
- "index++;\n" <<
- "}\n";
- }
- k <<
- "while(index < end){\n" <<
-
- k.decl<const input_type>("value") << " = "
- << first[k.var<uint_>("index")] << ";\n";
- if(exclusive){
- k <<
- "if(get_global_id(0) == 0){\n" <<
- result[k.var<uint_>("index")] << " = sum;\n" <<
- "}\n";
- }
- k <<
- "sum = " << op(k.var<output_type>("sum"),
- k.var<output_type>("value")) << ";\n";
- if(!exclusive){
- k <<
- "if(get_global_id(0) == 0){\n" <<
- result[k.var<uint_>("index")] << " = sum;\n" <<
- "}\n";
- }
- k <<
- "index++;\n" <<
- "}\n" <<
- "block_partial_sums[get_global_id(0)] = sum;\n";
-
- kernel block_scan_kernel = k.compile(context);
-
- block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
- block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
- block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
-
- size_t global_work_size = compute_units;
- queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
-
- if(compute_units < 2) {
- return result + count;
- }
-
- meta_kernel l("scan_on_cpu_final_scan");
-
- count_arg = l.add_arg<uint_>("count");
- block_partial_sums_arg =
- l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
- l <<
- "uint block = (count + get_global_size(0))/(get_global_size(0) + 1);\n" <<
- "uint index = block + get_global_id(0) * block;\n" <<
- "uint end = min(count, index + block);\n" <<
- k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
- "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
- "sum = " << op(k.var<output_type>("sum"),
- k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
- "}\n" <<
- "while(index < end){\n";
- if(exclusive){
- l <<
- l.decl<output_type>("value") << " = "
- << first[k.var<uint_>("index")] << ";\n" <<
- result[k.var<uint_>("index")] << " = sum;\n" <<
- "sum = " << op(k.var<output_type>("sum"),
- k.var<output_type>("value")) << ";\n";
- }
- else {
- l <<
- "sum = " << op(k.var<output_type>("sum"),
- first[k.var<uint_>("index")]) << ";\n" <<
- result[k.var<uint_>("index")] << " = sum;\n";
- }
- l <<
- "index++;\n" <<
- "}\n";
-
- kernel final_scan_kernel = l.compile(context);
-
- final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
- final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
-
- global_work_size = compute_units;
- queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
-
- return result + count;
- }
- }
- }
- }
- #endif
|