//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
#define BOOST_COMPUTE_ALGORITHM_COPY_HPP

#include <algorithm>
#include <iterator>

#include <boost/utility/enable_if.hpp>
#include <boost/mpl/and.hpp>
#include <boost/mpl/not.hpp>
#include <boost/mpl/or.hpp>

#include <boost/compute/buffer.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/copy_on_device.hpp>
#include <boost/compute/algorithm/detail/copy_to_device.hpp>
#include <boost/compute/algorithm/detail/copy_to_host.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/container/mapped_view.hpp>
#include <boost/compute/detail/device_ptr.hpp>
#include <boost/compute/detail/is_contiguous_iterator.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/type_traits/is_device_iterator.hpp>

namespace boost {
namespace compute {
namespace detail {

namespace mpl = boost::mpl;

// meta-function returning true if copy() between InputIterator and
// OutputIterator can be implemented with clEnqueueCopyBuffer().
template<class InputIterator, class OutputIterator>
struct can_copy_with_copy_buffer :
    mpl::and_<
        mpl::or_<
            boost::is_same<
                InputIterator,
                buffer_iterator<typename InputIterator::value_type>
            >,
            boost::is_same<
                InputIterator,
                detail::device_ptr<typename InputIterator::value_type>
            >
        >,
        mpl::or_<
            boost::is_same<
                OutputIterator,
                buffer_iterator<typename OutputIterator::value_type>
            >,
            boost::is_same<
                OutputIterator,
                detail::device_ptr<typename OutputIterator::value_type>
            >
        >,
        boost::is_same<
            typename InputIterator::value_type,
            typename OutputIterator::value_type
        >
    >::type {};

// meta-function returning true if the value_types of HostIterator and
// DeviceIterator are the same
template<class HostIterator, class DeviceIterator>
struct is_same_value_type :
    boost::is_same<
        typename boost::remove_cv<
            typename std::iterator_traits<HostIterator>::value_type
        >::type,
        typename boost::remove_cv<
            typename DeviceIterator::value_type
        >::type
    >::type {};

// meta-function returning true if the value_type of HostIterator is bool
template<class HostIterator>
struct is_bool_value_type :
    boost::is_same<
        typename boost::remove_cv<
            typename std::iterator_traits<HostIterator>::value_type
        >::type,
        bool
    >::type {};

// host -> device (async)
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            mpl::not_<
                                is_device_iterator<InputIterator>
                            >,
                            is_device_iterator<OutputIterator>,
                            is_same_value_type<InputIterator, OutputIterator>
                        >
                    >::type* = 0)
{
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<InputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    return copy_to_device_async(first, last, result, queue);
}

// host -> device (async)
// Type mismatch between InputIterator and OutputIterator value_types
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            mpl::not_<
                                is_device_iterator<InputIterator>
                            >,
                            is_device_iterator<OutputIterator>,
                            mpl::not_<
                                is_same_value_type<InputIterator, OutputIterator>
                            >
                        >
                    >::type* = 0)
{
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<InputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();
    size_t count = iterator_range_size(first, last);

    if(count < size_t(1)) {
        return future<OutputIterator>();
    }

    // map [first; last) to device and run copy kernel
    // on device for copying & casting
    ::boost::compute::mapped_view<input_type> mapped_host(
        // make sure it's a pointer to constant data
        // to force read only mapping
        const_cast<const input_type*>(
            ::boost::addressof(*first)
        ),
        count,
        context
    );
    return copy_on_device_async(
        mapped_host.begin(), mapped_host.end(), result, queue
    );
}

// host -> device
// InputIterator is a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      mpl::not_<
                          is_device_iterator<InputIterator>
                      >,
                      is_device_iterator<OutputIterator>,
                      is_same_value_type<InputIterator, OutputIterator>,
                      is_contiguous_iterator<InputIterator>
                  >
              >::type* = 0)
{
    return copy_to_device(first, last, result, queue);
}

// host -> device
// Type mismatch between InputIterator and OutputIterator value_types
// InputIterator is a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      mpl::not_<
                          is_device_iterator<InputIterator>
                      >,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          is_same_value_type<InputIterator, OutputIterator>
                      >,
                      is_contiguous_iterator<InputIterator>
                  >
              >::type* = 0)
{
    typedef typename OutputIterator::value_type output_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters
    std::string cache_key =
        std::string("__boost_compute_copy_to_device_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    size_t map_copy_threshold;
    size_t direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 524288; // 0.5 MB
        direct_copy_threshold = 52428800; // 50 MB
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds
    map_copy_threshold =
        parameters->get(cache_key, "map_copy_threshold", map_copy_threshold);
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t count = iterator_range_size(first, last);
    size_t input_size_bytes = count * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_device_map()
    if(input_size_bytes < map_copy_threshold) {
        return copy_to_device_map(first, last, result, queue);
    }
    // [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
    // on host and then perform copy_to_device()
    else if(input_size_bytes < direct_copy_threshold) {
        std::vector<output_type> vector(first, last);
        return copy_to_device(vector.begin(), vector.end(), result, queue);
    }
    // [direct_copy_threshold; inf) -> map [first; last) to device and
    // run copy kernel on device for copying & casting
    //
    // Perform async copy to device, wait for it to be finished and
    // return the result.
    // At this point we are sure that count > 0 (first != last), so the event
    // returned by dispatch_copy_async() must be valid.
    return dispatch_copy_async(first, last, result, queue).get();
}

// host -> device
// InputIterator is NOT a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      mpl::not_<
                          is_device_iterator<InputIterator>
                      >,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          is_contiguous_iterator<InputIterator>
                      >
                  >
              >::type* = 0)
{
    typedef typename OutputIterator::value_type output_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters
    std::string cache_key =
        std::string("__boost_compute_copy_to_device_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    size_t map_copy_threshold;
    size_t direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 524288; // 0.5 MB
        direct_copy_threshold = 52428800; // 50 MB
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds
    map_copy_threshold =
        parameters->get(cache_key, "map_copy_threshold", map_copy_threshold);
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t input_size = iterator_range_size(first, last);
    size_t input_size_bytes = input_size * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_device_map()
    //
    // if direct_copy_threshold is less than map_copy_threshold
    // copy_to_device_map() is used for every input
    if(input_size_bytes < map_copy_threshold
        || direct_copy_threshold <= map_copy_threshold) {
        return copy_to_device_map(first, last, result, queue);
    }
    // [map_copy_threshold; inf) -> convert [first; last)
    // on host and then perform copy_to_device()
    std::vector<output_type> vector(first, last);
    return copy_to_device(vector.begin(), vector.end(), result, queue);
}

// device -> host (async)
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            mpl::not_<
                                is_device_iterator<OutputIterator>
                            >,
                            is_same_value_type<OutputIterator, InputIterator>
                        >
                    >::type* = 0)
{
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<OutputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    return copy_to_host_async(first, last, result, queue);
}

// device -> host (async)
// Type mismatch between InputIterator and OutputIterator value_types
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            mpl::not_<
                                is_device_iterator<OutputIterator>
                            >,
                            mpl::not_<
                                is_same_value_type<OutputIterator, InputIterator>
                            >
                        >
                    >::type* = 0)
{
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<OutputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    size_t count = iterator_range_size(first, last);

    if(count < size_t(1)) {
        return future<OutputIterator>();
    }

    // map host memory to device
    buffer mapped_host(
        context,
        count * sizeof(output_type),
        buffer::write_only | buffer::use_host_ptr,
        static_cast<void*>(
            ::boost::addressof(*result)
        )
    );

    // copy async on device
    ::boost::compute::future<buffer_iterator<output_type> > future =
        copy_on_device_async(
            first, last, make_buffer_iterator<output_type>(mapped_host), queue
        );

    // update host memory asynchronously by mapping and unmapping memory
    event map_event;
    void* ptr = queue.enqueue_map_buffer_async(
        mapped_host, CL_MAP_READ, 0, count * sizeof(output_type),
        map_event, future.get_event()
    );
    event unmap_event =
        queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);

    return make_future(result + count, unmap_event);
}

// device -> host
// OutputIterator is a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      is_same_value_type<OutputIterator, InputIterator>,
                      is_contiguous_iterator<OutputIterator>,
                      mpl::not_<
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    return copy_to_host(first, last, result, queue);
}

// device -> host
// OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
// is a boolean type.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      mpl::or_<
                          mpl::not_<
                              is_contiguous_iterator<OutputIterator>
                          >,
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
    typedef typename InputIterator::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters
    std::string cache_key =
        std::string("__boost_compute_copy_to_host_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    size_t map_copy_threshold;
    size_t direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 33554432; // 32 MB
        direct_copy_threshold = 0; // it's never efficient for GPUs
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds
    map_copy_threshold =
        parameters->get(cache_key, "map_copy_threshold", map_copy_threshold);
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t count = iterator_range_size(first, last);
    size_t input_size_bytes = count * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_host_map()
    //
    // if direct_copy_threshold is less than map_copy_threshold
    // copy_to_host_map() is used for every input
    if(input_size_bytes < map_copy_threshold
        || direct_copy_threshold <= map_copy_threshold) {
        return copy_to_host_map(first, last, result, queue);
    }
    // [map_copy_threshold; inf) -> copy [first; last) to temporary vector
    // then copy (and convert) to result using std::copy()
    std::vector<input_type> vector(count);
    copy_to_host(first, last, vector.begin(), queue);
    return std::copy(vector.begin(), vector.end(), result);
}

// device -> host
// Type mismatch between InputIterator and OutputIterator value_types
// OutputIterator is a contiguous iterator
// value_type of OutputIterator is NOT a boolean type
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      mpl::not_<
                          is_same_value_type<OutputIterator, InputIterator>
                      >,
                      is_contiguous_iterator<OutputIterator>,
                      mpl::not_<
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
    typedef typename InputIterator::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters
    std::string cache_key =
        std::string("__boost_compute_copy_to_host_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    size_t map_copy_threshold;
    size_t direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 524288; // 0.5 MB
        direct_copy_threshold = 52428800; // 50 MB
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds
    map_copy_threshold =
        parameters->get(cache_key, "map_copy_threshold", map_copy_threshold);
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t count = iterator_range_size(first, last);
    size_t input_size_bytes = count * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_host_map()
    if(input_size_bytes < map_copy_threshold) {
        return copy_to_host_map(first, last, result, queue);
    }
    // [map_copy_threshold; direct_copy_threshold) -> copy [first; last) to
    // temporary vector then copy (and convert) to result using std::copy()
    else if(input_size_bytes < direct_copy_threshold) {
        std::vector<input_type> vector(count);
        copy_to_host(first, last, vector.begin(), queue);
        return std::copy(vector.begin(), vector.end(), result);
    }
    // [direct_copy_threshold; inf) -> map [result; result + input_size) to
    // device and run copy kernel on device for copying & casting
    //
    // Perform async copy to host, wait for it to be finished and
    // return the result.
    // At this point we are sure that count > 0 (first != last), so the event
    // returned by dispatch_copy_async() must be valid.
    return dispatch_copy_async(first, last, result, queue).get();
}

// device -> device
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          can_copy_with_copy_buffer<
                              InputIterator, OutputIterator
                          >
                      >
                  >
              >::type* = 0)
{
    return copy_on_device(first, last, result, queue);
}

// device -> device (specialization for buffer iterators)
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      is_device_iterator<OutputIterator>,
                      can_copy_with_copy_buffer<
                          InputIterator, OutputIterator
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return result;
    }

    queue.enqueue_copy_buffer(first.get_buffer(),
                              result.get_buffer(),
                              first.get_index() * sizeof(value_type),
                              result.get_index() * sizeof(value_type),
                              static_cast<size_t>(n) * sizeof(value_type));
    return result + n;
}

// device -> device (async)
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            mpl::not_<
                                can_copy_with_copy_buffer<
                                    InputIterator, OutputIterator
                                >
                            >
                        >
                    >::type* = 0)
{
    return copy_on_device_async(first, last, result, queue);
}

// device -> device (async, specialization for buffer iterators)
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            can_copy_with_copy_buffer<
                                InputIterator, OutputIterator
                            >
                        >
                    >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return make_future(result, event());
    }

    event event_ =
        queue.enqueue_copy_buffer(
            first.get_buffer(),
            result.get_buffer(),
            first.get_index() * sizeof(value_type),
            result.get_index() * sizeof(value_type),
            static_cast<size_t>(n) * sizeof(value_type)
        );

    return make_future(result + n, event_);
}

// host -> host
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if_c<
                  !is_device_iterator<InputIterator>::value &&
                  !is_device_iterator<OutputIterator>::value
              >::type* = 0)
{
    (void) queue;

    return std::copy(first, last, result);
}

} // end detail namespace

/// Copies the values in the range [\p first, \p last) to the range
/// beginning at \p result.
///
/// The generic copy() function can be used for a variety of data
/// transfer tasks and provides a standard interface to the following
/// OpenCL functions:
///
/// \li \c clEnqueueReadBuffer()
/// \li \c clEnqueueWriteBuffer()
/// \li \c clEnqueueCopyBuffer()
///
/// Unlike the aforementioned OpenCL functions, copy() will also work
/// with non-contiguous data-structures (e.g. \c std::list<T>) as
/// well as with "fancy" iterators (e.g. transform_iterator).
///
/// \param first first element in the range to copy
/// \param last last element in the range to copy
/// \param result first element in the result range
/// \param queue command queue to perform the operation
///
/// \return \c OutputIterator to the end of the result range
///
/// For example, to copy an array of \c int values on the host to a vector on
/// the device:
/// \code
/// // array on the host
/// int data[] = { 1, 2, 3, 4 };
///
/// // vector on the device
/// boost::compute::vector<int> vec(4, context);
///
/// // copy values to the device vector
/// boost::compute::copy(data, data + 4, vec.begin(), queue);
/// \endcode
///
/// The copy algorithm can also be used with standard containers such as
/// \c std::vector<T>:
/// \code
/// std::vector<int> host_vector = ...
/// boost::compute::vector<int> device_vector = ...
///
/// // copy from the host to the device
/// boost::compute::copy(
///     host_vector.begin(), host_vector.end(), device_vector.begin(), queue
/// );
///
/// // copy from the device to the host
/// boost::compute::copy(
///     device_vector.begin(), device_vector.end(), host_vector.begin(), queue
/// );
/// \endcode
///
/// \see copy_n(), copy_if(), copy_async()
template<class InputIterator, class OutputIterator>
inline OutputIterator copy(InputIterator first,
                           InputIterator last,
                           OutputIterator result,
                           command_queue &queue = system::default_queue())
{
    return detail::dispatch_copy(first, last, result, queue);
}

/// Copies the values in the range [\p first, \p last) to the range
/// beginning at \p result. The copy is performed asynchronously.
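///
/// For example, to start copying a vector of \c int values from the host to
/// the device and later wait for the transfer to finish (a minimal sketch; it
/// assumes a valid \c context and \c queue, as in the copy() examples, and
/// that the host range stays alive until the copy completes):
/// \code
/// std::vector<int> host_vector = ...
///
/// // vector on the device
/// boost::compute::vector<int> device_vector(host_vector.size(), context);
///
/// // begin copying data to the device (returns immediately)
/// boost::compute::future<boost::compute::vector<int>::iterator> f =
///     boost::compute::copy_async(
///         host_vector.begin(), host_vector.end(), device_vector.begin(), queue
///     );
///
/// // ... do other work on the host while the copy runs ...
///
/// // wait for the copy to finish
/// f.wait();
/// \endcode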
///
/// \see copy()
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
copy_async(InputIterator first,
           InputIterator last,
           OutputIterator result,
           command_queue &queue = system::default_queue())
{
    return detail::dispatch_copy_async(first, last, result, queue);
}

} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP