diff options
author | Andrey Fedorov <andrey.fedorov@intel.com> | 2021-02-16 12:02:37 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-16 12:02:37 +0300 |
commit | 94189435085a09fcca528f6313b0006d179bbf00 (patch) | |
tree | e31c81155879cdb7bf15a44217098202c7957a02 | |
parent | Add FPGA_EMU tests (#118) (diff) | |
download | llvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.gz llvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.bz2 llvm-project-94189435085a09fcca528f6313b0006d179bbf00.zip |
Refactor generated names of kernels (#113)
* reworked kernel names
* more testing is added
* make shift_left_right run with unnamed lambdas
* some fixes for CI
* applied some suggestions from review
* addressed feedback from review
* cmake changes + kernel name for shift_left, shift_right algorithms
* make some tests compile
* remove unused variables
15 files changed, 252 insertions, 203 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index b2b86ccd19bc..d0e8c230964b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,6 @@ if (FIND_GXX_EXE) execute_process(COMMAND ${FIND_GXX_EXE} -dumpfullversion OUTPUT_VARIABLE _onedpl_gxx_version) endif() -option(ONEDPL_USE_UNNAMED_LAMBDA "Pass -fsycl-unnamed-lambda compile option" OFF) option(ONEDPL_FPGA_STATIC_REPORT "Enable the static report generation for the FPGA device" OFF) option(ONEDPL_USE_AOT_COMPILATION "Enable the ahead of time compilation via OCLOC compiler" OFF) @@ -147,9 +146,17 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") # DPC++ specific compiler options target_compile_options(oneDPL INTERFACE - $<$<BOOL:${ONEDPL_USE_UNNAMED_LAMBDA}>:-fsycl-unnamed-lambda> $<$<OR:$<BOOL:${ONEDPL_USE_DEVICE_FPGA_HW}>,$<BOOL:${ONEDPL_USE_DEVICE_FPGA_EMU}>>:-fintelfpga> ) + if (DEFINED ONEDPL_USE_UNNAMED_LAMBDA) + if(ONEDPL_USE_UNNAMED_LAMBDA) + message(STATUS "Use unnamed lambdas") + target_compile_options(oneDPL INTERFACE -fsycl-unnamed-lambda) + else() + message(STATUS "Don't use unnamed lambdas") + target_compile_options(oneDPL INTERFACE -fno-sycl-unnamed-lambda) + endif() + endif() # DPC++ specific macro target_compile_definitions(oneDPL INTERFACE diff --git a/cmake/README.md b/cmake/README.md index 93fac833f635..ad08fb0e69fd 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -10,7 +10,7 @@ The following variables are provided for oneDPL configuration: |------------------------------|--------|-----------------------------------------------------------------------------------------------|---------------| | ONEDPL_BACKEND | STRING | Threading backend; supported values: tbb, dpcpp, dpcpp_only, serial, ...; the default value is defined by compiler: dpcpp for DPC++ and tbb for others | tbb/dpcpp | | ONEDPL_DEVICE_TYPE | STRING | Device type, applicable only for sycl backends; supported values: GPU, CPU, FPGA_HW, FPGA_EMU | GPU | -| ONEDPL_USE_UNNAMED_LAMBDA | BOOL | Pass `-fsycl-unnamed-lambda` compile option | OFF | +| ONEDPL_USE_UNNAMED_LAMBDA | BOOL | Pass `-fsycl-unnamed-lambda`, `-fno-sycl-unnamed-lambda` compile options or nothing | | | ONEDPL_FPGA_STATIC_REPORT | BOOL | Enable the static report generation for the FPGA_HW device type | OFF | | ONEDPL_USE_AOT_COMPILATION | BOOL | Enable the ahead of time compilation via OpenCL™ Offline Compiler (OCLOC) | OFF | | ONEDPL_AOT_ARCH | STRING | Architecture options for the ahead of time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html); the default value `*` means compilation for all available options | * | diff --git a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h index 6f997d5c772d..44b6dbca381a 100644 --- a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h +++ b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h @@ -29,6 +29,12 @@ namespace dpl { namespace internal { + +template <typename Name> +class ExclusiveScan1; +template <typename Name> +class ExclusiveScan2; + template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename T, typename BinaryPredicate, typename Operator> oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type, OutputIterator> @@ -67,7 +73,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter temp[0] = init; - typename internal::rebind_policy<policy_type, class ExclusiveScan1>::type policy1(policy); + typename internal::rebind_policy<policy_type, ExclusiveScan1<policy_type>>::type policy1(policy); // TODO : add stencil form of replace_copy_if to oneDPL if the // transform call here is difficult to understand and maintain. @@ -78,7 +84,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter replace_copy_if(policy1, first2, last2 - 1, _flags.get() + 1, _temp.get() + 1, ::std::negate<FlagType>(), init); #endif - typename internal::rebind_policy<policy_type, class ExclusiveScan2>::type policy2(policy); + typename internal::rebind_policy<policy_type, ExclusiveScan2<policy_type>>::type policy2(policy); // scan key-flag tuples inclusive_scan(policy2, make_zip_iterator(_temp.get(), _flags.get()), @@ -133,7 +139,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter temp[0] = init; } - typename internal::rebind_policy<policy_type, class ExclusiveScan1>::type policy1(policy); + typename internal::rebind_policy<policy_type, ExclusiveScan1<policy_type>>::type policy1(policy); // TODO : add stencil form of replace_copy_if to oneDPL if the // transform call here is difficult to understand and maintain. @@ -144,7 +150,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter replace_copy_if(policy1, first2, last2 - 1, _flags.get() + 1, _temp.get() + 1, ::std::negate<FlagType>(), init); # endif - typename internal::rebind_policy<policy_type, class ExclusiveScan2>::type policy2(policy); + typename internal::rebind_policy<policy_type, ExclusiveScan2<policy_type>>::type policy2(policy); // scan key-flag tuples transform_inclusive_scan(policy2, make_zip_iterator(_temp.get(), _flags.get()), diff --git a/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h b/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h index 59dfc5ccc233..5e2df863130e 100644 --- a/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h +++ b/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h @@ -30,6 +30,10 @@ namespace dpl { namespace internal { + +template <typename Name> +class InclusiveScan1; + template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename BinaryPredicate, typename BinaryOperator> oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type, OutputIterator> @@ -58,7 +62,7 @@ inclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter transform(::std::forward<Policy>(policy), first1, last1 - 1, first1 + 1, _mask.get() + 1, ::std::not2(binary_pred)); - typename internal::rebind_policy<policy_type, class InclusiveScan1>::type policy1(policy); + typename internal::rebind_policy<policy_type, InclusiveScan1<policy_type>>::type policy1(policy); inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(result, _mask.get()), internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op)); @@ -106,7 +110,7 @@ inclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter transform(::std::forward<Policy>(policy), first1, last1 - 1, first1 + 1, _mask.get() + 1, ::std::not2(binary_pred)); - typename internal::rebind_policy<policy_type, class InclusiveScan1>::type policy1(policy); + typename internal::rebind_policy<policy_type, InclusiveScan1<policy_type>>::type policy1(policy); transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(result, _mask.get()), internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op), diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 18e93b94199f..db36c8c9a621 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -30,6 +30,15 @@ namespace dpl namespace internal { +template <typename Name> +class Reduce1; +template <typename Name> +class Reduce2; +template <typename Name> +class Reduce3; +template <typename Name> +class Reduce4; + template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator1, typename OutputIterator2, typename BinaryPred, typename BinaryOperator> oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type, @@ -87,7 +96,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la oneapi::dpl::__par_backend::__buffer<policy_type, FlagType> _scanned_tail_flags(n); // Compute the sum of the segments. scanned_tail_flags values are not used. - typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy); + typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy); inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()), internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op)); @@ -95,7 +104,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la // for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 } // Compute the indicies each segment sum should be written - typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy); + typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy); oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0), ::std::plus<CountType>()); @@ -108,7 +117,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la CountType N = scanned_tail_flags[n - 1] + 1; // scatter the keys and accumulated values - typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy); + typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy); oneapi::dpl::for_each(policy3, make_zip_iterator(first1, scanned_tail_flags, mask, scanned_values, mask + 1), make_zip_iterator(first1, scanned_tail_flags, mask, scanned_values, mask + 1) + n, internal::scatter_and_accumulate_fun<OutputIterator1, OutputIterator2>(result1, result2)); @@ -193,7 +202,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n); // Compute the sum of the segments. scanned_tail_flags values are not used. - typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy); + typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy); transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()), @@ -203,7 +212,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la // for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 } // Compute the indicies each segment sum should be written - typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy); + typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy); oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0), ::std::plus<CountType>()); @@ -222,8 +231,8 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la } // scatter the keys and accumulated values - typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy); - typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy); + typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy); + typename internal::rebind_policy<policy_type, Reduce4<policy_type>>::type policy4(policy); // permutation iterator reorders elements in result1 so the element at index // _scanned_tail_flags[i] is returned when index i of the iterator is accessed. The result @@ -320,7 +329,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n); // Compute the sum of the segments. scanned_tail_flags values are not used. - typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy); + typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy); transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()), @@ -330,7 +339,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la // for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 } // Compute the indicies each segment sum should be written - typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy); + typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy); oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0), ::std::plus<CountType>()); @@ -349,8 +358,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la } // scatter the keys and accumulated values - typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy); - typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy); + typename internal::rebind_policy<policy_type, Reduce4<policy_type>>::type policy4(policy); // result1 is a discard_iterator instance so we omit the write to it. @@ -440,7 +448,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n); // Compute the sum of the segments. scanned_tail_flags values are not used. - typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy); + typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy); transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()), @@ -450,7 +458,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la // for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 } // Compute the indicies each segment sum should be written - typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy); + typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy); oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0), ::std::plus<CountType>()); @@ -469,8 +477,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la } // scatter the keys and accumulated values - typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy); - typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy); + typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy); // permutation iterator reorders elements in result1 so the element at index // _scanned_tail_flags[i] is returned when index i of the iterator is accessed. The result diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index cdb001fa0043..eb41600c2ad6 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -1064,10 +1064,14 @@ __pattern_unique_copy(_ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1 return __result_first + __result.second; } -template <typename Name> +template <typename _Name> class copy_back_wrapper { }; +template <typename _Name> +class copy_back_wrapper2 +{ +}; template <typename _ExecutionPolicy, typename _Iterator, typename _Predicate> oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy, _Iterator> @@ -1247,12 +1251,16 @@ __pattern_merge(_ExecutionPolicy&& __exec, _Iterator1 __first1, _Iterator1 __las //To consider the direct copying pattern call in case just one of sequences is empty. if (__n1 == 0) oneapi::dpl::__internal::__pattern_walk2_brick( - ::std::forward<_ExecutionPolicy>(__exec), __first2, __last2, __d_first, - oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, ::std::true_type()); + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<copy_back_wrapper>( + ::std::forward<_ExecutionPolicy>(__exec)), + __first2, __last2, __d_first, oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, + ::std::true_type()); else if (__n2 == 0) oneapi::dpl::__internal::__pattern_walk2_brick( - ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __d_first, - oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, ::std::true_type()); + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<copy_back_wrapper2>( + ::std::forward<_ExecutionPolicy>(__exec)), + __first1, __last1, __d_first, oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, + ::std::true_type()); else { auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); @@ -1339,11 +1347,6 @@ __pattern_stable_sort(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __ .wait(); } -template <typename Name> -class copy_back_wrapper2 -{ -}; - template <typename _ExecutionPolicy, typename _Iterator, typename _UnaryPredicate> oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy, _Iterator> __pattern_stable_partition(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _UnaryPredicate __pred, @@ -1902,9 +1905,11 @@ __pattern_set_union(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _Forw __comp, unseq_backend::_DifferenceTag()) - __buf; //2. Merge {1} and the difference - return oneapi::dpl::__internal::__pattern_merge(::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __buf, - __buf + __n_diff, __result, __comp, - /*vector=*/::std::true_type(), /*parallel=*/::std::true_type()); + return oneapi::dpl::__internal::__pattern_merge( + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__set_union_copy_case_2>( + ::std::forward<_ExecutionPolicy>(__exec)), + __first1, __last1, __buf, __buf + __n_diff, __result, __comp, + /*vector=*/::std::true_type(), /*parallel=*/::std::true_type()); } //Dummy names to avoid kernel problems @@ -1996,6 +2001,11 @@ __pattern_set_symmetric_difference(_ExecutionPolicy&& __exec, _ForwardIterator1 __comp, ::std::true_type(), ::std::true_type()); } +template <typename _Name> +class __shift_left_right +{ +}; + template <typename _ExecutionPolicy, typename _Range> oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__difference_t<_Range>> @@ -2027,7 +2037,10 @@ __pattern_shift_left(_ExecutionPolicy&& __exec, _Range __rng, oneapi::dpl::__int else //2. n < size/2; 'n' parallel copying { auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n}; - oneapi::dpl::__par_backend_hetero::__parallel_for(::std::forward<_ExecutionPolicy>(__exec), __brick, __n, __rng) + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>( + ::std::forward<_ExecutionPolicy>(__exec)), + __brick, __n, __rng) .wait(); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 9643fedcd336..4a9f7910da11 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -175,33 +175,6 @@ make_iter_mode(const _Iterator& __it) -> decltype(iter_mode<outMode>()(__it)) return iter_mode<outMode>()(__it); } -// function is needed to wrap kernel name into another class -template <template <typename> class _NewKernelName, typename _Policy, - oneapi::dpl::__internal::__enable_if_device_execution_policy<_Policy, int> = 0> -auto -make_wrapped_policy(_Policy&& __policy) - -> decltype(oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>( - ::std::forward<_Policy>(__policy))) -{ - return oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>( - ::std::forward<_Policy>(__policy)); -} - -#if _ONEDPL_FPGA_DEVICE -template <template <typename> class _NewKernelName, typename _Policy, - oneapi::dpl::__internal::__enable_if_fpga_execution_policy<_Policy, int> = 0> -auto -make_wrapped_policy(_Policy&& __policy) - -> decltype(oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor, - _NewKernelName<typename __decay_t<_Policy>::kernel_name>>( - ::std::forward<_Policy>(__policy))) -{ - return oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor, - _NewKernelName<typename __decay_t<_Policy>::kernel_name>>( - ::std::forward<_Policy>(__policy)); -} -#endif - // set of templated classes to name kernels template <typename _DerivedKernelName> class __kernel_name_base @@ -227,19 +200,19 @@ class __parallel_reduce_kernel : public __kernel_name_base<__parallel_reduce_ker { }; template <typename... _Name> -class __parallel_scan_kernel_1 : public __kernel_name_base<__parallel_scan_kernel_1<_Name...>> +class __parallel_scan_local_kernel : public __kernel_name_base<__parallel_scan_local_kernel<_Name...>> { }; template <typename... _Name> -class __parallel_scan_kernel_2 : public __kernel_name_base<__parallel_scan_kernel_2<_Name...>> +class __parallel_scan_global_kernel : public __kernel_name_base<__parallel_scan_global_kernel<_Name...>> { }; template <typename... _Name> -class __parallel_scan_kernel_3 : public __kernel_name_base<__parallel_scan_kernel_3<_Name...>> +class __parallel_scan_propagate_kernel : public __kernel_name_base<__parallel_scan_propagate_kernel<_Name...>> { }; template <typename... _Name> -class __parallel_find_or_kernel_1 : public __kernel_name_base<__parallel_find_or_kernel_1<_Name...>> +class __parallel_find_or_kernel : public __kernel_name_base<__parallel_find_or_kernel<_Name...>> { }; template <typename... _Name> @@ -247,15 +220,15 @@ class __parallel_merge_kernel : public __kernel_name_base<__parallel_merge_kerne { }; template <typename... _Name> -class __parallel_sort_kernel_1 : public __kernel_name_base<__parallel_sort_kernel_1<_Name...>> +class __parallel_sort_leaf_kernel : public __kernel_name_base<__parallel_sort_leaf_kernel<_Name...>> { }; template <typename... _Name> -class __parallel_sort_kernel_2 : public __kernel_name_base<__parallel_sort_kernel_2<_Name...>> +class __parallel_sort_global_kernel : public __kernel_name_base<__parallel_sort_global_kernel<_Name...>> { }; template <typename... _Name> -class __parallel_sort_kernel_3 : public __kernel_name_base<__parallel_sort_kernel_3<_Name...>> +class __parallel_sort_copy_back_kernel : public __kernel_name_base<__parallel_sort_copy_back_kernel<_Name...>> { }; @@ -272,19 +245,16 @@ __parallel_for(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&& assert(__get_first_range(::std::forward<_Ranges>(__rngs)...).size() > 0); using _Policy = typename ::std::decay<_ExecutionPolicy>::type; - using __kernel_name = typename _Policy::kernel_name; -#if __SYCL_UNNAMED_LAMBDA__ - using __kernel_name_t = __parallel_for_kernel<_Fp, __kernel_name, _Ranges...>; -#else - using __kernel_name_t = __parallel_for_kernel<__kernel_name>; -#endif + using _CustomName = typename _Policy::kernel_name; + using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_for_kernel, _CustomName, + _Fp, _Ranges...>; _PRINT_INFO_IN_DEBUG_MODE(__exec); auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - __cgh.parallel_for<__kernel_name_t>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) { + __cgh.parallel_for<_ForKernel>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) { auto __idx = __item_id.get_linear_id(); __brick(__idx, __rngs...); }); @@ -306,12 +276,10 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _ using _Size = decltype(__n); using _Policy = typename ::std::decay<_ExecutionPolicy>::type; - using __kernel_name = typename _Policy::kernel_name; -#if __SYCL_UNNAMED_LAMBDA__ - using __kernel_name_t = __parallel_reduce_kernel<_Up, _Cp, _Rp, __kernel_name, _Ranges...>; -#else - using __kernel_name_t = __parallel_reduce_kernel<__kernel_name>; -#endif + using _CustomName = typename _Policy::kernel_name; + using _ReduceKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_reduce_kernel, _CustomName, _Up, _Cp, + _Rp, _Ranges...>; sycl::cl_uint __max_compute_units = oneapi::dpl::__internal::__max_compute_units(__exec); ::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); @@ -319,7 +287,7 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _ __work_group_size = oneapi::dpl::__internal::__max_local_allocation_size<_ExecutionPolicy, _Tp>( ::std::forward<_ExecutionPolicy>(__exec), __work_group_size); #if _ONEDPL_COMPILE_KERNEL - sycl::kernel __kernel = __kernel_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + sycl::kernel __kernel = _ReduceKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); __work_group_size = ::std::min(__work_group_size, oneapi::dpl::__internal::__kernel_work_group_size( ::std::forward<_ExecutionPolicy>(__exec), __kernel)); #endif @@ -354,7 +322,7 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _ auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh); sycl::accessor<_Tp, 1, access_mode::read_write, sycl::access::target::local> __temp_local( sycl::range<1>(__work_group_size), __cgh); - __cgh.parallel_for<__kernel_name_t>( + __cgh.parallel_for<_ReduceKernel>( #if _ONEDPL_COMPILE_KERNEL __kernel, #endif @@ -404,21 +372,21 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& _InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) { using _Policy = typename ::std::decay<_ExecutionPolicy>::type; - using _KernelName = typename _Policy::kernel_name; + using _CustomName = typename _Policy::kernel_name; using _Type = typename _InitType::__value_type; -#if __SYCL_UNNAMED_LAMBDA__ - using __kernel_1_name_t = __parallel_scan_kernel_1<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan, - _GroupScan, _GlobalScan, _KernelName>; - using __kernel_2_name_t = __parallel_scan_kernel_2<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan, - _GroupScan, _GlobalScan, _KernelName>; - using __kernel_3_name_t = __parallel_scan_kernel_3<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan, - _GroupScan, _GlobalScan, _KernelName>; -#else - using __kernel_1_name_t = __parallel_scan_kernel_1<_KernelName>; - using __kernel_2_name_t = __parallel_scan_kernel_2<_KernelName>; - using __kernel_3_name_t = __parallel_scan_kernel_3<_KernelName>; -#endif + using _LocalScanKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_local_kernel, _CustomName, _Range1, + _Range2, _BinaryOperation, _Type, _LocalScan, + _GroupScan, _GlobalScan>; + using _GlobalScanKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_global_kernel, _CustomName, + _Range1, _Range2, _BinaryOperation, _Type, + _LocalScan, _GroupScan, _GlobalScan>; + using _PropagateKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_propagate_kernel, _CustomName, + _Range1, _Range2, _BinaryOperation, _Type, + _LocalScan, _GroupScan, _GlobalScan>; auto __n = __rng1.size(); assert(__n > 0); @@ -430,8 +398,8 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& ::std::forward<_ExecutionPolicy>(__exec), __wgroup_size); #if _ONEDPL_COMPILE_KERNEL - auto __kernel_1 = __kernel_1_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); - auto __kernel_2 = __kernel_2_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + auto __kernel_1 = _LocalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + auto __kernel_2 = _GlobalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); auto __wgroup_size_kernel_1 = oneapi::dpl::__internal::__kernel_work_group_size(::std::forward<_ExecutionPolicy>(__exec), __kernel_1); auto __wgroup_size_kernel_2 = @@ -455,7 +423,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& sycl::accessor<_Type, 1, access_mode::discard_read_write, sycl::access::target::local> __local_acc( __wgroup_size, __cgh); - __cgh.parallel_for<__kernel_1_name_t>( + __cgh.parallel_for<_LocalScanKernel>( #if _ONEDPL_COMPILE_KERNEL __kernel_1, #endif @@ -475,7 +443,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& sycl::accessor<_Type, 1, access_mode::discard_read_write, sycl::access::target::local> __local_acc( __wgroup_size, __cgh); - __cgh.parallel_for<__kernel_2_name_t>( + __cgh.parallel_for<_GlobalScanKernel>( #if _ONEDPL_COMPILE_KERNEL __kernel_2, #endif @@ -492,7 +460,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __cgh.depends_on(__submit_event); oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer auto __wg_sums_acc = __wg_sums.template get_access<access_mode::read>(__cgh); - __cgh.parallel_for<__kernel_3_name_t>(sycl::range<1>(__n_groups * __size_per_wg), [=](sycl::item<1> __item) { + __cgh.parallel_for<_PropagateKernel>(sycl::range<1>(__n_groups * __size_per_wg), [=](sycl::item<1> __item) { __global_scan(__item, __rng2, __rng1, __wg_sums_acc, __n, __size_per_wg); }); }); @@ -648,14 +616,10 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy< __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag, _Ranges&&... __rngs) { using _Policy = typename ::std::decay<_ExecutionPolicy>::type; - using __kernel_name = typename _Policy::kernel_name; + using _CustomName = typename _Policy::kernel_name; using _AtomicType = typename _BrickTag::_AtomicType; - -#if __SYCL_UNNAMED_LAMBDA__ - using __kernel_1_name_t = __parallel_find_or_kernel_1<_Ranges..., _Brick, __kernel_name>; -#else - using __kernel_1_name_t = __parallel_find_or_kernel_1<__kernel_name>; -#endif + using _FindOrKernel = oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_find_or_kernel, + _CustomName, _Brick, _Ranges...>; auto __or_tag_check = ::std::is_same<_BrickTag, __parallel_or_tag>{}; auto __rng_n = oneapi::dpl::__ranges::__get_first_range(::std::forward<_Ranges>(__rngs)...).size(); @@ -663,7 +627,7 @@ __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag, auto __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(::std::forward<_ExecutionPolicy>(__exec)); #if _ONEDPL_COMPILE_KERNEL - auto __kernel = __kernel_1_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + auto __kernel = _FindOrKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); __wgroup_size = ::std::min(__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size( ::std::forward<_ExecutionPolicy>(__exec), __kernel)); #endif @@ -693,7 +657,7 @@ __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag, // create local accessor to connect atomic with sycl::accessor<_AtomicType, 1, access_mode::read_write, sycl::access::target::local> __temp_local(1, __cgh); - __cgh.parallel_for<__kernel_1_name_t>( + __cgh.parallel_for<_FindOrKernel>( #if _ONEDPL_COMPILE_KERNEL __kernel, #endif @@ -1036,12 +1000,10 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _ __parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) { using _Policy = typename ::std::decay<_ExecutionPolicy>::type; - using __kernel_name = typename _Policy::kernel_name; -#if __SYCL_UNNAMED_LAMBDA__ - using __kernel_1_name_t = __parallel_merge_kernel<_Range1, _Range2, _Range3, _Compare, __kernel_name>; -#else - using __kernel_1_name_t = __parallel_merge_kernel<__kernel_name>; -#endif + using _CustomName = typename _Policy::kernel_name; + using _MergeKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_merge_kernel, _CustomName, _Range1, + _Range2, _Range3, _Compare>; auto __n = __rng1.size(); auto __n_2 = __rng2.size(); @@ -1055,7 +1017,7 @@ __parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) { + __cgh.parallel_for<_MergeKernel>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) { __full_merge_kernel()(__item_id.get_linear_id() * __chunk, __rng1, decltype(__n)(0), __n, __rng2, decltype(__n_2)(0), __n_2, __rng3, decltype(__n)(0), __comp, __chunk); }); @@ -1095,16 +1057,16 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp) { using _Policy = typename ::std::decay<_ExecutionPolicy>::type; - using __kernel_name = typename _Policy::kernel_name; -#if __SYCL_UNNAMED_LAMBDA__ - using __kernel_1_name_t = __parallel_sort_kernel_1<_Range, _Merge, _Compare, __kernel_name>; - using __kernel_2_name_t = __parallel_sort_kernel_2<_Range, _Merge, _Compare, __kernel_name>; - using __kernel_3_name_t = __parallel_sort_kernel_3<_Range, _Merge, _Compare, __kernel_name>; -#else - using __kernel_1_name_t = __parallel_sort_kernel_1<__kernel_name>; - using __kernel_2_name_t = __parallel_sort_kernel_2<__kernel_name>; - using __kernel_3_name_t = __parallel_sort_kernel_3<__kernel_name>; -#endif + using _CustomName = typename _Policy::kernel_name; + using _LeafSortKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_leaf_kernel, _CustomName, _Range, + _Merge, _Compare>; + using _GlobalSortKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_global_kernel, _CustomName, _Range, + _Merge, _Compare>; + using _CopyBackKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_copy_back_kernel, _CustomName, + _Range, _Merge, _Compare>; using _Tp = oneapi::dpl::__internal::__value_t<_Range>; using _Size = oneapi::dpl::__internal::__difference_t<_Range>; @@ -1133,13 +1095,12 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, // 1. Perform sorting of the leaves of the merge sort tree sycl::event __event1 = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng); - __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__leaf_steps), - [=](sycl::item</*dim=*/1> __item_id) { - const _Size __idx = __item_id.get_linear_id() * __leaf; - const _Size __start = __idx; - const _Size __end = sycl::min(__start + __leaf, __n); - __leaf_sort_kernel()(__rng, __start, __end, __comp); - }); + __cgh.parallel_for<_LeafSortKernel>(sycl::range</*dim=*/1>(__leaf_steps), [=](sycl::item</*dim=*/1> __item_id) { + const _Size __idx = __item_id.get_linear_id() * __leaf; + const _Size __start = __idx; + const _Size __end = sycl::min(__start + __leaf, __n); + __leaf_sort_kernel()(__rng, __start, __end, __comp); + }); }); _Size __sorted = __leaf; @@ -1183,7 +1144,7 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, __cgh.depends_on(__event1); oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp.template get_access<__par_backend_hetero::access_mode::read_write>(__cgh); - __cgh.parallel_for<__kernel_2_name_t>( + __cgh.parallel_for<_GlobalSortKernel>( sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) { const _Size __idx = __item_id.get_linear_id(); // Borders of the first and the second sorted sequences @@ -1221,7 +1182,7 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp.template get_access<access_mode::read>(__cgh); // We cannot use __cgh.copy here because of zip_iterator usage - __cgh.parallel_for<__kernel_3_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) { + __cgh.parallel_for<_CopyBackKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) { __rng[__item_id.get_linear_id()] = __temp_acc[__item_id]; }); }); @@ -1235,14 +1196,13 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _ __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp) { using _Policy = typename ::std::decay<_ExecutionPolicy>::type; - using __kernel_name = typename _Policy::kernel_name; -#if __SYCL_UNNAMED_LAMBDA__ - using __kernel_1_name_t = __parallel_sort_kernel_1<_Range, _Merge, _Compare, __kernel_name>; - using __kernel_2_name_t = __parallel_sort_kernel_2<_Range, _Merge, _Compare, __kernel_name>; -#else - using __kernel_1_name_t = __parallel_sort_kernel_1<__kernel_name>; - using __kernel_2_name_t = __parallel_sort_kernel_2<__kernel_name>; -#endif + using _CustomName = typename _Policy::kernel_name; + using _GlobalSortKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_global_kernel, _CustomName, _Range, + _Merge, _Compare>; + using _CopyBackKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_copy_back_kernel, _CustomName, + _Range, _Merge, _Compare>; using _Tp = oneapi::dpl::__internal::__value_t<_Range>; using _Size = oneapi::dpl::__internal::__difference_t<_Range>; @@ -1262,7 +1222,7 @@ __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge _ __cgh.depends_on(__event1); oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh); - __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) { + __cgh.parallel_for<_GlobalSortKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) { auto __global_idx = __item_id.get_linear_id(); _Size __start = 2 * __k * (__global_idx / (2 * __k)); @@ -1293,7 +1253,7 @@ __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge _ oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp.template get_access<access_mode::read>(__cgh); // we cannot use __cgh.copy here because of zip_iterator usage - __cgh.parallel_for<__kernel_2_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) { + __cgh.parallel_for<_CopyBackKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) { __rng[__item_id.get_linear_id()] = __temp_acc[__item_id]; }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 4915b1f0a396..13d21dd380b4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -55,6 +55,9 @@ class __radix_sort_reorder_kernel : public __kernel_name_base<__radix_sort_reord { }; +template <typename _Name> +class __odd_iteration; + //------------------------------------------------------------------------ // radix sort: ordered traits for a given size and integral/float flag //------------------------------------------------------------------------ @@ -562,30 +565,28 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme _InRange&& __in_rng, _OutRange&& __out_rng, _TmpBuf&& __tmp_buf, sycl::event __dependency_event) { - using _KernelName = typename __decay_t<_ExecutionPolicy>::kernel_name; -#if __SYCL_UNNAMED_LAMBDA__ - using __in_range_t = __decay_t<_InRange>; - using __out_range_t = __decay_t<_OutRange>; - using __tmp_buf_t = __decay_t<_TmpBuf>; - using __count_kernel_name = __radix_sort_count_kernel<__in_range_t, __tmp_buf_t, _KernelName>; - using __scan_kernel_name_1 = __radix_sort_scan_kernel_1<__tmp_buf_t, _KernelName>; - using __scan_kernel_name_2 = __radix_sort_scan_kernel_2<__tmp_buf_t, _KernelName>; - using __reorder_kernel_name = __radix_sort_reorder_kernel<__in_range_t, __out_range_t, ::std::size_t, _KernelName>; -#else - using __count_kernel_name = __radix_sort_count_kernel<_KernelName>; - using __scan_kernel_name_1 = __radix_sort_scan_kernel_1<_KernelName>; - using __scan_kernel_name_2 = __radix_sort_scan_kernel_2<_KernelName>; - using __reorder_kernel_name = __radix_sort_reorder_kernel<_KernelName>; -#endif + using _CustomName = typename __decay_t<_ExecutionPolicy>::kernel_name; + using _RadixCountKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_count_kernel, _CustomName, + __decay_t<_InRange>, __decay_t<_TmpBuf>>; + using _RadixLocalScanKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_scan_kernel_1, _CustomName, + __decay_t<_TmpBuf>>; + using _RadixGlobalScanKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_scan_kernel_2, _CustomName, + __decay_t<_TmpBuf>>; + using _RadixReorderKernel = + oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_reorder_kernel, _CustomName, + __decay_t<_InRange>, __decay_t<_OutRange>>; ::std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__exec); ::std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); ::std::size_t __block_size = __max_sg_size; ::std::size_t __reorder_sg_size = __max_sg_size; #if _ONEDPL_COMPILE_KERNEL - auto __count_kernel = __count_kernel_name::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); - auto __scan_kernel_1 = __scan_kernel_name_1::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); - auto __reorder_kernel = __reorder_kernel_name::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + auto __count_kernel = _RadixCountKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + auto __scan_kernel_1 = _RadixLocalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + auto __reorder_kernel = _RadixReorderKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); ::std::size_t __count_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __count_kernel); __reorder_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __reorder_kernel); __block_size = sycl::max(__count_sg_size, __reorder_sg_size); @@ -596,7 +597,7 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme __block_size = __radix_states; // 1. Count Phase - sycl::event __count_event = __radix_sort_count_submit<__count_kernel_name, __radix_bits, __is_comp_asc>( + sycl::event __count_event = __radix_sort_count_submit<_RadixCountKernel, __radix_bits, __is_comp_asc>( __exec, __segments, __block_size, __radix_iter, __in_rng, __tmp_buf, __dependency_event #if _ONEDPL_COMPILE_KERNEL , @@ -605,11 +606,11 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme ); // 2. Scan Phase - sycl::event __scan_event = __radix_sort_scan_submit<__scan_kernel_name_1, __scan_kernel_name_2, __radix_bits>( + sycl::event __scan_event = __radix_sort_scan_submit<_RadixLocalScanKernel, _RadixGlobalScanKernel, __radix_bits>( __exec, __scan_wg_size, __segments, __tmp_buf, __count_event); // 3. Reorder Phase - sycl::event __reorder_event = __radix_sort_reorder_submit<__reorder_kernel_name, __radix_bits, __is_comp_asc>( + sycl::event __reorder_event = __radix_sort_reorder_submit<_RadixReorderKernel, __radix_bits, __is_comp_asc>( __exec, __segments, __block_size, __reorder_sg_size, __radix_iter, __in_rng, __out_rng, __tmp_buf, __scan_event #if _ONEDPL_COMPILE_KERNEL , @@ -662,10 +663,12 @@ __parallel_radix_sort(_ExecutionPolicy&& __exec, _Range&& __in_rng) // TODO: convert to ordered type once at the first iteration and convert back at the last one if (__radix_iter % 2 == 0) __iteration_event = __parallel_radix_sort_iteration<__radix_bits, __is_comp_asc>( - __exec, __segments, __radix_iter, __in_rng, __out_rng, __tmp_buf, __iteration_event); - else //swap __in_rng and__out_rng + ::std::forward<_ExecutionPolicy>(__exec), __segments, __radix_iter, __in_rng, __out_rng, __tmp_buf, + __iteration_event); + else //swap __in_rng and __out_rng __iteration_event = __parallel_radix_sort_iteration<__radix_bits, __is_comp_asc>( - __exec, __segments, __radix_iter, __out_rng, __in_rng, __tmp_buf, __iteration_event); + make_wrapped_policy<__odd_iteration>(::std::forward<_ExecutionPolicy>(__exec)), __segments, + __radix_iter, __out_rng, __in_rng, __tmp_buf, __iteration_event); // TODO: since reassign to __iteration_event does not work, we have to make explicit wait on the event explicit_wait_if<::std::is_pointer<decltype(__in_rng.begin())>::value>{}(__iteration_event); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 67892b8a7c1e..ee275c252881 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -82,8 +82,45 @@ struct explicit_wait_if<true> } }; +// function is needed to wrap kernel name into another class +template <template <typename> class _NewKernelName, typename _Policy, + oneapi::dpl::__internal::__enable_if_device_execution_policy<_Policy, int> = 0> +auto +make_wrapped_policy(_Policy&& __policy) + -> decltype(oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>( + ::std::forward<_Policy>(__policy))) +{ + return oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>( + ::std::forward<_Policy>(__policy)); +} + +#if _ONEDPL_FPGA_DEVICE +template <template <typename> class _NewKernelName, typename _Policy, + oneapi::dpl::__internal::__enable_if_fpga_execution_policy<_Policy, int> = 0> +auto +make_wrapped_policy(_Policy&& __policy) + -> decltype(oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor, + _NewKernelName<typename __decay_t<_Policy>::kernel_name>>( + ::std::forward<_Policy>(__policy))) +{ + return oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor, + _NewKernelName<typename __decay_t<_Policy>::kernel_name>>( + ::std::forward<_Policy>(__policy)); +} +#endif + namespace __internal { + +template <template <typename...> class _BaseName, typename _CustomName, typename... _Args> +using _KernelName_t = +#if __SYCL_UNNAMED_LAMBDA__ + typename std::conditional<std::is_same<_CustomName, oneapi::dpl::execution::DefaultKernelName>::value, + _BaseName<_CustomName, _Args...>, _BaseName<_CustomName>>::type; +#else + _BaseName<_CustomName>; +#endif + #if _ONEDPL_DEBUG_SYCL template <typename _Policy> inline void diff --git a/test/general/lambda_naming.pass.cpp b/test/general/lambda_naming.pass.cpp index 2b9b5c0a7d67..b3040432bc56 100644 --- a/test/general/lambda_naming.pass.cpp +++ b/test/general/lambda_naming.pass.cpp @@ -30,7 +30,7 @@ using namespace TestUtils; // This is the simple test for compilation only, to check if lambda naming works correctly int main() { -#if __SYCL_UNNAMED_LAMBDA__ && _ONEDPL_BACKEND_SYCL +#if _ONEDPL_BACKEND_SYCL const int n = 1000; sycl::buffer<int, 1> buf{ sycl::range<1>(n) }; sycl::buffer<int, 1> out_buf{ sycl::range<1>(n) }; @@ -44,10 +44,13 @@ int main() { sycl::noinit); #else sycl::property::noinit{}); -#endif +#endif // __cplusplus >= 201703L + ::std::fill(policy, buf_begin_discard_write, buf_begin_discard_write + n, 1); +#if __SYCL_UNNAMED_LAMBDA__ ::std::sort(policy, buf_begin, buf_end); ::std::for_each(policy, buf_begin, buf_end, [](int& x) { x += 41; }); + #if !_ONEDPL_FPGA_DEVICE ::std::inplace_merge(policy, buf_begin, buf_begin + n / 2, buf_end); auto red_val = ::std::reduce(policy, buf_begin, buf_end, 1); @@ -58,8 +61,15 @@ int main() { EXPECT_TRUE(!is_equal, "wrong return value from equal"); auto does_1_exist = ::std::find(policy, buf_begin, buf_end, 1); EXPECT_TRUE(does_1_exist - buf_begin == 1000, "wrong return value from find"); -#endif -#endif +#endif // !_ONEDPL_FPGA_DEVICE + +#else + // ::std::for_each(policy, buf_begin, buf_end, [](int& x) { x++; }); // It's not allowed. Policy with different name is needed + ::std::for_each(oneapi::dpl::execution::make_device_policy<class ForEach>(policy), buf_begin, buf_end, [](int& x) { x++; }); + auto red_val = ::std::reduce(policy, buf_begin, buf_end, 1); + EXPECT_TRUE(red_val == 2001, "wrong return value from reduce"); +#endif // __SYCL_UNNAMED_LAMBDA__ +#endif // _ONEDPL_BACKEND_SYCL ::std::cout << done() << ::std::endl; return 0; } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp index 6384864b1963..f608f6aeb287 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp @@ -31,6 +31,9 @@ #define _PSTL_TEST_SHIFT_RIGHT #endif +template<typename Name> +struct USM; + struct test_shift { template <typename Policy, typename It, typename Algo> @@ -86,7 +89,7 @@ struct test_shift //copying data to USM buffer ::std::copy_n(first, m, ptr.get()); - auto het_res = algo(::std::forward<Policy>(exec), ptr.get(), ptr.get() + m, n); + auto het_res = algo(oneapi::dpl::execution::make_device_policy<USM<Algo>>(::std::forward<Policy>(exec)), ptr.get(), ptr.get() + m, n); res_idx = het_res - ptr.get(); //3.2 check result @@ -97,16 +100,6 @@ struct test_shift #endif }; -template <typename T, typename Size, typename Algo> -void -test_shift_by_type(Size m, Size n, Algo algo) -{ - TestUtils::Sequence<T> orig(m, [](::std::size_t v) -> T { return T(v); }); //fill data - TestUtils::Sequence<T> in(m, [](::std::size_t v) -> T { return T(v); }); //fill data - - TestUtils::invoke_on_all_policies<>()(test_shift(), in.begin(), m, orig.begin(), n, algo); -} - struct shift_left_algo { template <typename Policy, typename It> @@ -188,6 +181,21 @@ struct shift_right_algo } }; +template <typename T, typename Size> +void +test_shift_by_type(Size m, Size n) +{ + TestUtils::Sequence<T> orig(m, [](::std::size_t v) -> T { return T(v); }); //fill data + TestUtils::Sequence<T> in(m, [](::std::size_t v) -> T { return T(v); }); //fill data + +#ifdef _PSTL_TEST_SHIFT_LEFT + TestUtils::invoke_on_all_policies<0>()(test_shift(), in.begin(), m, orig.begin(), n, shift_left_algo{}); +#endif +#ifdef _PSTL_TEST_SHIFT_RIGHT + TestUtils::invoke_on_all_policies<1>()(test_shift(), in.begin(), m, orig.begin(), n, shift_right_algo{}); +#endif +} + int main() { @@ -195,13 +203,7 @@ main() for (long m = 0; m < N; m = m < 16 ? m + 1 : long(3.1415 * m)) for (long n = 0; n < N; n = n < 16 ? n + 1 : long(3.1415 * n)) { - //std::cout << "m: " << m << " n: " << n << std::endl; -#ifdef _PSTL_TEST_SHIFT_LEFT - test_shift_by_type<int32_t>(m, n, shift_left_algo{}); -#endif -#ifdef _PSTL_TEST_SHIFT_RIGHT - test_shift_by_type<int32_t>(m, n, shift_right_algo{}); -#endif + test_shift_by_type<int32_t>(m, n); } ::std::cout << TestUtils::done() << ::std::endl; diff --git a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp index 0a6609e51b54..8317e0bfa910 100644 --- a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp @@ -103,7 +103,7 @@ void test_with_usm() res_head[5] = 9; res_head[6] = 9; res_head[7] = 9; res_head[8] = 9; res_head[9] = 9; // call algorithm - auto new_policy = oneapi::dpl::execution::make_device_policy(q); + auto new_policy = oneapi::dpl::execution::make_device_policy<class exclusive_scan_by_segment_1>(q); oneapi::dpl::exclusive_scan_by_segment(new_policy, key_head, key_head+n, val_head, res_head, (uint64_t)0, std::equal_to<uint64_t>(), std::plus<uint64_t>()); q.wait(); @@ -120,7 +120,7 @@ void test_with_usm() // call algorithm on single element range res_head[0] = 9; - auto new_policy2 = oneapi::dpl::execution::make_device_policy(q); + auto new_policy2 = oneapi::dpl::execution::make_device_policy<class exclusive_scan_by_segment_2>(q); oneapi::dpl::exclusive_scan_by_segment(new_policy2, key_head, key_head+1, val_head, res_head, (uint64_t)0); diff --git a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp index 0c130857f894..c9ab5a00d8cf 100644 --- a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp @@ -102,7 +102,7 @@ void test_with_usm() res_head[5] = 9; res_head[6] = 9; res_head[7] = 9; res_head[8] = 9; res_head[9] = 9; // call algorithm - auto new_policy = oneapi::dpl::execution::make_device_policy(q); + auto new_policy = oneapi::dpl::execution::make_device_policy<class inclusive_scan_by_segment_1>(q); oneapi::dpl::inclusive_scan_by_segment(new_policy, key_head, key_head+n, val_head, res_head, std::equal_to<uint64_t>(), std::plus<uint64_t>()); @@ -118,7 +118,7 @@ void test_with_usm() // call algorithm on single element range res_head[0] = 9; - auto new_policy2 = oneapi::dpl::execution::make_device_policy(q); + auto new_policy2 = oneapi::dpl::execution::make_device_policy<class inclusive_scan_by_segment_2>(q); oneapi::dpl::inclusive_scan_by_segment(new_policy2, key_head, key_head+1, val_head, res_head); // check values diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index 2d125d0a3b7d..96e48a104f37 100644 --- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp @@ -152,7 +152,7 @@ void test_with_usm() val_head[12] = 0; // call algorithm - auto new_policy = oneapi::dpl::execution::make_device_policy(q); + auto new_policy = oneapi::dpl::execution::make_device_policy<class reduce_by_segment_1>(q); auto res1 = oneapi::dpl::reduce_by_segment(new_policy, key_head, key_head + n, val_head, key_res_head, val_res_head); // check values @@ -179,7 +179,7 @@ void test_with_usm() key_res_head[0] = 9; val_res_head[0] = 9; - auto new_policy2 = oneapi::dpl::execution::make_device_policy(q); + auto new_policy2 = oneapi::dpl::execution::make_device_policy<class reduce_by_segment_2>(q); auto res2 = oneapi::dpl::reduce_by_segment(new_policy2, key_head, key_head + 1, val_head, key_res_head, val_res_head); // check values diff --git a/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp b/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp index 49ce643713d8..76fe8c246cc1 100644 --- a/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp +++ b/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp @@ -50,7 +50,7 @@ main() auto exec = TestUtils::default_dpcpp_policy; merge(exec, all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(D)); - merge(exec, all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(E), ::std::less<T>()); + merge(oneapi::dpl::execution::make_device_policy<class merge_2>(exec), all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(E), ::std::less<T>()); } //check result |