123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
- #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
- #include <iterator>
- #include <boost/compute/types.hpp>
- #include <boost/compute/functional.hpp>
- #include <boost/compute/command_queue.hpp>
- #include <boost/compute/container/detail/scalar.hpp>
- #include <boost/compute/iterator/buffer_iterator.hpp>
- #include <boost/compute/type_traits/type_name.hpp>
- #include <boost/compute/detail/meta_kernel.hpp>
- #include <boost/compute/detail/iterator_range_size.hpp>
- #include <boost/compute/detail/parameter_cache.hpp>
- namespace boost {
- namespace compute {
- namespace detail {
- template<class InputIterator, class UnaryPredicate>
- inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
- InputIterator last,
- UnaryPredicate predicate,
- const size_t count,
- command_queue &queue)
- {
- typedef typename std::iterator_traits<InputIterator>::value_type value_type;
- typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
- const context &context = queue.get_context();
- detail::meta_kernel k("find_if");
- size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
- atomic_min<uint_> atomic_min_uint;
- k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
- << k.decl<const value_type>("value") << "="
- << first[k.var<const uint_>("i")] << ";\n"
- << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
- << " " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
- << "}\n";
- kernel kernel = k.compile(context);
- scalar<uint_> index(context);
- kernel.set_arg(index_arg, index.get_buffer());
-
- index.write(static_cast<uint_>(count), queue);
- queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
-
- return first + static_cast<difference_type>(index.read(queue));
- }
- template<class InputIterator, class UnaryPredicate>
- inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
- InputIterator last,
- UnaryPredicate predicate,
- const size_t count,
- const size_t vpt,
- command_queue &queue)
- {
- typedef typename std::iterator_traits<InputIterator>::value_type value_type;
- typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
- const context &context = queue.get_context();
- const device &device = queue.get_device();
- detail::meta_kernel k("find_if");
- size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
- size_t count_arg = k.add_arg<const uint_>("count");
- size_t vpt_arg = k.add_arg<const uint_>("vpt");
- atomic_min<uint_> atomic_min_uint;
-
- if(device.type() & device::gpu) {
- k <<
- k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
- k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
- k.decl<const uint_>("end") << " = min(" <<
- "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
- "count" <<
- ");\n" <<
-
- "__local uint local_index;\n" <<
- "if(get_local_id(0) == 0){\n" <<
- " local_index = *index;\n " <<
- "};\n" <<
- "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
- "if(local_index < id){\n" <<
- " return;\n" <<
- "}\n" <<
- "while(id < end){\n" <<
- " " << k.decl<const value_type>("value") << " = " <<
- first[k.var<const uint_>("id")] << ";\n"
- " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
- " " << atomic_min_uint(k.var<uint_ *>("index"),
- k.var<uint_>("id")) << ";\n" <<
- " return;\n"
- " }\n" <<
- " id+=lsize;\n" <<
- "}\n";
-
-
- } else {
- k <<
- k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
- k.decl<const uint_>("end") << " = min(" <<
- "id + " << k.var<uint_>("vpt") << "," <<
- "count" <<
- ");\n" <<
- "while(id < end && (*index) > id){\n" <<
- " " << k.decl<const value_type>("value") << " = " <<
- first[k.var<const uint_>("id")] << ";\n"
- " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
- " " << atomic_min_uint(k.var<uint_ *>("index"),
- k.var<uint_>("id")) << ";\n" <<
- " return;\n" <<
- " }\n" <<
- " id++;\n" <<
- "}\n";
- }
- kernel kernel = k.compile(context);
- scalar<uint_> index(context);
- kernel.set_arg(index_arg, index.get_buffer());
- kernel.set_arg(count_arg, static_cast<uint_>(count));
- kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));
-
- index.write(static_cast<uint_>(count), queue);
- const size_t global_wg_size = static_cast<size_t>(
- std::ceil(float(count) / vpt)
- );
- queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);
-
- return first + static_cast<difference_type>(index.read(queue));
- }
- template<class InputIterator, class UnaryPredicate>
- inline InputIterator find_if_with_atomics(InputIterator first,
- InputIterator last,
- UnaryPredicate predicate,
- command_queue &queue)
- {
- typedef typename std::iterator_traits<InputIterator>::value_type value_type;
- size_t count = detail::iterator_range_size(first, last);
- if(count == 0){
- return last;
- }
- const device &device = queue.get_device();
-
- std::string cache_key = std::string("__boost_find_if_with_atomics_")
- + type_name<value_type>();
- boost::shared_ptr<parameter_cache> parameters =
- detail::parameter_cache::get_global_cache(device);
-
-
- if(device.type() & device::gpu){
- const size_t one_vpt_threshold =
- parameters->get(cache_key, "one_vpt_threshold", 1048576);
- if(count <= one_vpt_threshold){
- return find_if_with_atomics_one_vpt(
- first, last, predicate, count, queue
- );
- }
- }
-
- size_t vpt;
- if(device.type() & device::gpu){
-
- vpt = parameters->get(cache_key, "vpt", 32);
- } else {
-
- const size_t max_compute_units =
- device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
- vpt = static_cast<size_t>(
- std::ceil(float(count) / max_compute_units)
- );
- }
- return find_if_with_atomics_multiple_vpt(
- first, last, predicate, count, vpt, queue
- );
- }
- }
- }
- }
- #endif
|