Refactor generated names of kernels (#113)

* reworked kernel names * more testing is added * make shift_left_right run with unnamed lambdas * some fixes for CI * applied some suggestions from review * addressed feedback from review * cmake changes + kernel name for shift_left, shift_right algorithms * make some tests compile * remove unused variables
author: Andrey Fedorov <andrey.fedorov@intel.com> 2021-02-16 12:02:37 +0300
committer: GitHub <noreply@github.com> 2021-02-16 12:02:37 +0300
commit: 94189435085a09fcca528f6313b0006d179bbf00 (patch)
tree: e31c81155879cdb7bf15a44217098202c7957a02
parent: Add FPGA_EMU tests (#118) (diff)
download: llvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.gz
llvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.bz2
llvm-project-94189435085a09fcca528f6313b0006d179bbf00.zip
15 files changed, 252 insertions, 203 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2b86ccd19bc..d0e8c230964b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,6 @@ if (FIND_GXX_EXE)
     execute_process(COMMAND ${FIND_GXX_EXE} -dumpfullversion OUTPUT_VARIABLE _onedpl_gxx_version)
 endif()
 
-option(ONEDPL_USE_UNNAMED_LAMBDA "Pass -fsycl-unnamed-lambda compile option" OFF)
 option(ONEDPL_FPGA_STATIC_REPORT "Enable the static report generation for the FPGA device" OFF)
 option(ONEDPL_USE_AOT_COMPILATION "Enable the ahead of time compilation via OCLOC compiler" OFF)
 
@@ -147,9 +146,17 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$")
 
         # DPC++ specific compiler options
         target_compile_options(oneDPL INTERFACE
-            $<$<BOOL:${ONEDPL_USE_UNNAMED_LAMBDA}>:-fsycl-unnamed-lambda>
             $<$<OR:$<BOOL:${ONEDPL_USE_DEVICE_FPGA_HW}>,$<BOOL:${ONEDPL_USE_DEVICE_FPGA_EMU}>>:-fintelfpga>
             )
+        if (DEFINED ONEDPL_USE_UNNAMED_LAMBDA)
+            if(ONEDPL_USE_UNNAMED_LAMBDA)
+                message(STATUS "Use unnamed lambdas")
+                target_compile_options(oneDPL INTERFACE -fsycl-unnamed-lambda)
+            else()
+                message(STATUS "Don't use unnamed lambdas")
+                target_compile_options(oneDPL INTERFACE -fno-sycl-unnamed-lambda)
+            endif()
+        endif()
 
         # DPC++ specific macro
         target_compile_definitions(oneDPL INTERFACE
diff --git a/cmake/README.md b/cmake/README.md
index 93fac833f635..ad08fb0e69fd 100644
--- a/cmake/README.md
+++ b/cmake/README.md
@@ -10,7 +10,7 @@ The following variables are provided for oneDPL configuration:
 |------------------------------|--------|-----------------------------------------------------------------------------------------------|---------------|
 | ONEDPL_BACKEND               | STRING | Threading backend; supported values: tbb, dpcpp, dpcpp_only, serial, ...; the default value is defined by compiler: dpcpp for DPC++ and tbb for others | tbb/dpcpp |
 | ONEDPL_DEVICE_TYPE           | STRING | Device type, applicable only for sycl backends; supported values: GPU, CPU, FPGA_HW, FPGA_EMU | GPU           |
-| ONEDPL_USE_UNNAMED_LAMBDA    | BOOL   | Pass `-fsycl-unnamed-lambda` compile option                                                   | OFF           |
+| ONEDPL_USE_UNNAMED_LAMBDA    | BOOL   | Pass `-fsycl-unnamed-lambda`, `-fno-sycl-unnamed-lambda` compile options or nothing           |               |
 | ONEDPL_FPGA_STATIC_REPORT    | BOOL   | Enable the static report generation for the FPGA_HW device type                               | OFF           |
 | ONEDPL_USE_AOT_COMPILATION   | BOOL   | Enable the ahead of time compilation via OpenCL™ Offline Compiler (OCLOC)                     | OFF           |
 | ONEDPL_AOT_ARCH              | STRING | Architecture options for the ahead of time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html); the default value `*` means compilation for all available options | *             |
diff --git a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h
index 6f997d5c772d..44b6dbca381a 100644
--- a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h
@@ -29,6 +29,12 @@ namespace dpl
 {
 namespace internal
 {
+
+template <typename Name>
+class ExclusiveScan1;
+template <typename Name>
+class ExclusiveScan2;
+
 template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename T,
           typename BinaryPredicate, typename Operator>
 oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type, OutputIterator>
@@ -67,7 +73,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
 
     temp[0] = init;
 
-    typename internal::rebind_policy<policy_type, class ExclusiveScan1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, ExclusiveScan1<policy_type>>::type policy1(policy);
 
     // TODO : add stencil form of replace_copy_if to oneDPL if the
     // transform call here is difficult to understand and maintain.
@@ -78,7 +84,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
     replace_copy_if(policy1, first2, last2 - 1, _flags.get() + 1, _temp.get() + 1, ::std::negate<FlagType>(), init);
 #endif
 
-    typename internal::rebind_policy<policy_type, class ExclusiveScan2>::type policy2(policy);
+    typename internal::rebind_policy<policy_type, ExclusiveScan2<policy_type>>::type policy2(policy);
 
     // scan key-flag tuples
     inclusive_scan(policy2, make_zip_iterator(_temp.get(), _flags.get()),
@@ -133,7 +139,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
         temp[0] = init;
     }
 
-    typename internal::rebind_policy<policy_type, class ExclusiveScan1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, ExclusiveScan1<policy_type>>::type policy1(policy);
 
     // TODO : add stencil form of replace_copy_if to oneDPL if the
     // transform call here is difficult to understand and maintain.
@@ -144,7 +150,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
     replace_copy_if(policy1, first2, last2 - 1, _flags.get() + 1, _temp.get() + 1, ::std::negate<FlagType>(), init);
 #    endif
 
-    typename internal::rebind_policy<policy_type, class ExclusiveScan2>::type policy2(policy);
+    typename internal::rebind_policy<policy_type, ExclusiveScan2<policy_type>>::type policy2(policy);
 
     // scan key-flag tuples
     transform_inclusive_scan(policy2, make_zip_iterator(_temp.get(), _flags.get()),
diff --git a/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h b/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h
index 59dfc5ccc233..5e2df863130e 100644
--- a/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h
@@ -30,6 +30,10 @@ namespace dpl
 {
 namespace internal
 {
+
+template <typename Name>
+class InclusiveScan1;
+
 template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator,
           typename BinaryPredicate, typename BinaryOperator>
 oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type, OutputIterator>
@@ -58,7 +62,7 @@ inclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
 
     transform(::std::forward<Policy>(policy), first1, last1 - 1, first1 + 1, _mask.get() + 1, ::std::not2(binary_pred));
 
-    typename internal::rebind_policy<policy_type, class InclusiveScan1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, InclusiveScan1<policy_type>>::type policy1(policy);
     inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n,
                    make_zip_iterator(result, _mask.get()),
                    internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op));
@@ -106,7 +110,7 @@ inclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
 
     transform(::std::forward<Policy>(policy), first1, last1 - 1, first1 + 1, _mask.get() + 1, ::std::not2(binary_pred));
 
-    typename internal::rebind_policy<policy_type, class InclusiveScan1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, InclusiveScan1<policy_type>>::type policy1(policy);
     transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
                              make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(result, _mask.get()),
                              internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op),
diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
index 18e93b94199f..db36c8c9a621 100644
--- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
@@ -30,6 +30,15 @@ namespace dpl
 namespace internal
 {
 
+template <typename Name>
+class Reduce1;
+template <typename Name>
+class Reduce2;
+template <typename Name>
+class Reduce3;
+template <typename Name>
+class Reduce4;
+
 template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator1,
           typename OutputIterator2, typename BinaryPred, typename BinaryOperator>
 oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type,
@@ -87,7 +96,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     oneapi::dpl::__par_backend::__buffer<policy_type, FlagType> _scanned_tail_flags(n);
 
     // Compute the sum of the segments. scanned_tail_flags values are not used.
-    typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
     inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n,
                    make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
                    internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op));
@@ -95,7 +104,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     // for example: _scanned_values     = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
 
     // Compute the indicies each segment sum should be written
-    typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+    typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
     oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
                                 ::std::plus<CountType>());
 
@@ -108,7 +117,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     CountType N = scanned_tail_flags[n - 1] + 1;
 
     // scatter the keys and accumulated values
-    typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
+    typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy);
     oneapi::dpl::for_each(policy3, make_zip_iterator(first1, scanned_tail_flags, mask, scanned_values, mask + 1),
                           make_zip_iterator(first1, scanned_tail_flags, mask, scanned_values, mask + 1) + n,
                           internal::scatter_and_accumulate_fun<OutputIterator1, OutputIterator2>(result1, result2));
@@ -193,7 +202,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n);
 
     // Compute the sum of the segments. scanned_tail_flags values are not used.
-    typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
     transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
                              make_zip_iterator(first2, _mask.get()) + n,
                              make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
@@ -203,7 +212,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     // for example: _scanned_values     = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
 
     // Compute the indicies each segment sum should be written
-    typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+    typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
     oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
                                 ::std::plus<CountType>());
 
@@ -222,8 +231,8 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     }
 
     // scatter the keys and accumulated values
-    typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
-    typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy);
+    typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy);
+    typename internal::rebind_policy<policy_type, Reduce4<policy_type>>::type policy4(policy);
 
     // permutation iterator reorders elements in result1 so the element at index
     // _scanned_tail_flags[i] is returned when index i of the iterator is accessed. The result
@@ -320,7 +329,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n);
 
     // Compute the sum of the segments. scanned_tail_flags values are not used.
-    typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
     transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
                              make_zip_iterator(first2, _mask.get()) + n,
                              make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
@@ -330,7 +339,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     // for example: _scanned_values     = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
 
     // Compute the indicies each segment sum should be written
-    typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+    typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
     oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
                                 ::std::plus<CountType>());
 
@@ -349,8 +358,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     }
 
     // scatter the keys and accumulated values
-    typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
-    typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy);
+    typename internal::rebind_policy<policy_type, Reduce4<policy_type>>::type policy4(policy);
 
     // result1 is a discard_iterator instance so we omit the write to it.
 
@@ -440,7 +448,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n);
 
     // Compute the sum of the segments. scanned_tail_flags values are not used.
-    typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+    typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
     transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
                              make_zip_iterator(first2, _mask.get()) + n,
                              make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
@@ -450,7 +458,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     // for example: _scanned_values     = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
 
     // Compute the indicies each segment sum should be written
-    typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+    typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
     oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
                                 ::std::plus<CountType>());
 
@@ -469,8 +477,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
     }
 
     // scatter the keys and accumulated values
-    typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
-    typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy);
+    typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy);
 
     // permutation iterator reorders elements in result1 so the element at index
     // _scanned_tail_flags[i] is returned when index i of the iterator is accessed. The result
diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
index cdb001fa0043..eb41600c2ad6 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -1064,10 +1064,14 @@ __pattern_unique_copy(_ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1
     return __result_first + __result.second;
 }
 
-template <typename Name>
+template <typename _Name>
 class copy_back_wrapper
 {
 };
+template <typename _Name>
+class copy_back_wrapper2
+{
+};
 
 template <typename _ExecutionPolicy, typename _Iterator, typename _Predicate>
 oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy, _Iterator>
@@ -1247,12 +1251,16 @@ __pattern_merge(_ExecutionPolicy&& __exec, _Iterator1 __first1, _Iterator1 __las
     //To consider the direct copying pattern call in case just one of sequences is empty.
     if (__n1 == 0)
         oneapi::dpl::__internal::__pattern_walk2_brick(
-            ::std::forward<_ExecutionPolicy>(__exec), __first2, __last2, __d_first,
-            oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, ::std::true_type());
+            oneapi::dpl::__par_backend_hetero::make_wrapped_policy<copy_back_wrapper>(
+                ::std::forward<_ExecutionPolicy>(__exec)),
+            __first2, __last2, __d_first, oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{},
+            ::std::true_type());
     else if (__n2 == 0)
         oneapi::dpl::__internal::__pattern_walk2_brick(
-            ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __d_first,
-            oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, ::std::true_type());
+            oneapi::dpl::__par_backend_hetero::make_wrapped_policy<copy_back_wrapper2>(
+                ::std::forward<_ExecutionPolicy>(__exec)),
+            __first1, __last1, __d_first, oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{},
+            ::std::true_type());
     else
     {
         auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
@@ -1339,11 +1347,6 @@ __pattern_stable_sort(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __
         .wait();
 }
 
-template <typename Name>
-class copy_back_wrapper2
-{
-};
-
 template <typename _ExecutionPolicy, typename _Iterator, typename _UnaryPredicate>
 oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy, _Iterator>
 __pattern_stable_partition(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _UnaryPredicate __pred,
@@ -1902,9 +1905,11 @@ __pattern_set_union(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _Forw
                                                                            __comp, unseq_backend::_DifferenceTag()) -
                           __buf;
     //2. Merge {1} and the difference
-    return oneapi::dpl::__internal::__pattern_merge(::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __buf,
-                                                    __buf + __n_diff, __result, __comp,
-                                                    /*vector=*/::std::true_type(), /*parallel=*/::std::true_type());
+    return oneapi::dpl::__internal::__pattern_merge(
+        oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__set_union_copy_case_2>(
+            ::std::forward<_ExecutionPolicy>(__exec)),
+        __first1, __last1, __buf, __buf + __n_diff, __result, __comp,
+        /*vector=*/::std::true_type(), /*parallel=*/::std::true_type());
 }
 
 //Dummy names to avoid kernel problems
@@ -1996,6 +2001,11 @@ __pattern_set_symmetric_difference(_ExecutionPolicy&& __exec, _ForwardIterator1
                                                     __comp, ::std::true_type(), ::std::true_type());
 }
 
+template <typename _Name>
+class __shift_left_right
+{
+};
+
 template <typename _ExecutionPolicy, typename _Range>
 oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy,
                                                              oneapi::dpl::__internal::__difference_t<_Range>>
@@ -2027,7 +2037,10 @@ __pattern_shift_left(_ExecutionPolicy&& __exec, _Range __rng, oneapi::dpl::__int
     else //2. n < size/2; 'n' parallel copying
     {
         auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n};
-        oneapi::dpl::__par_backend_hetero::__parallel_for(::std::forward<_ExecutionPolicy>(__exec), __brick, __n, __rng)
+        oneapi::dpl::__par_backend_hetero::__parallel_for(
+            oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>(
+                ::std::forward<_ExecutionPolicy>(__exec)),
+            __brick, __n, __rng)
             .wait();
     }
 
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 9643fedcd336..4a9f7910da11 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -175,33 +175,6 @@ make_iter_mode(const _Iterator& __it) -> decltype(iter_mode<outMode>()(__it))
     return iter_mode<outMode>()(__it);
 }
 
-// function is needed to wrap kernel name into another class
-template <template <typename> class _NewKernelName, typename _Policy,
-          oneapi::dpl::__internal::__enable_if_device_execution_policy<_Policy, int> = 0>
-auto
-make_wrapped_policy(_Policy&& __policy)
-    -> decltype(oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
-        ::std::forward<_Policy>(__policy)))
-{
-    return oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
-        ::std::forward<_Policy>(__policy));
-}
-
-#if _ONEDPL_FPGA_DEVICE
-template <template <typename> class _NewKernelName, typename _Policy,
-          oneapi::dpl::__internal::__enable_if_fpga_execution_policy<_Policy, int> = 0>
-auto
-make_wrapped_policy(_Policy&& __policy)
-    -> decltype(oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
-                                                         _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
-        ::std::forward<_Policy>(__policy)))
-{
-    return oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
-                                                    _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
-        ::std::forward<_Policy>(__policy));
-}
-#endif
-
 // set of templated classes to name kernels
 template <typename _DerivedKernelName>
 class __kernel_name_base
@@ -227,19 +200,19 @@ class __parallel_reduce_kernel : public __kernel_name_base<__parallel_reduce_ker
 {
 };
 template <typename... _Name>
-class __parallel_scan_kernel_1 : public __kernel_name_base<__parallel_scan_kernel_1<_Name...>>
+class __parallel_scan_local_kernel : public __kernel_name_base<__parallel_scan_local_kernel<_Name...>>
 {
 };
 template <typename... _Name>
-class __parallel_scan_kernel_2 : public __kernel_name_base<__parallel_scan_kernel_2<_Name...>>
+class __parallel_scan_global_kernel : public __kernel_name_base<__parallel_scan_global_kernel<_Name...>>
 {
 };
 template <typename... _Name>
-class __parallel_scan_kernel_3 : public __kernel_name_base<__parallel_scan_kernel_3<_Name...>>
+class __parallel_scan_propagate_kernel : public __kernel_name_base<__parallel_scan_propagate_kernel<_Name...>>
 {
 };
 template <typename... _Name>
-class __parallel_find_or_kernel_1 : public __kernel_name_base<__parallel_find_or_kernel_1<_Name...>>
+class __parallel_find_or_kernel : public __kernel_name_base<__parallel_find_or_kernel<_Name...>>
 {
 };
 template <typename... _Name>
@@ -247,15 +220,15 @@ class __parallel_merge_kernel : public __kernel_name_base<__parallel_merge_kerne
 {
 };
 template <typename... _Name>
-class __parallel_sort_kernel_1 : public __kernel_name_base<__parallel_sort_kernel_1<_Name...>>
+class __parallel_sort_leaf_kernel : public __kernel_name_base<__parallel_sort_leaf_kernel<_Name...>>
 {
 };
 template <typename... _Name>
-class __parallel_sort_kernel_2 : public __kernel_name_base<__parallel_sort_kernel_2<_Name...>>
+class __parallel_sort_global_kernel : public __kernel_name_base<__parallel_sort_global_kernel<_Name...>>
 {
 };
 template <typename... _Name>
-class __parallel_sort_kernel_3 : public __kernel_name_base<__parallel_sort_kernel_3<_Name...>>
+class __parallel_sort_copy_back_kernel : public __kernel_name_base<__parallel_sort_copy_back_kernel<_Name...>>
 {
 };
 
@@ -272,19 +245,16 @@ __parallel_for(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&
     assert(__get_first_range(::std::forward<_Ranges>(__rngs)...).size() > 0);
 
     using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
-    using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
-    using __kernel_name_t = __parallel_for_kernel<_Fp, __kernel_name, _Ranges...>;
-#else
-    using __kernel_name_t = __parallel_for_kernel<__kernel_name>;
-#endif
+    using _CustomName = typename _Policy::kernel_name;
+    using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_for_kernel, _CustomName,
+                                                                                    _Fp, _Ranges...>;
 
     _PRINT_INFO_IN_DEBUG_MODE(__exec);
     auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
         //get an access to data under SYCL buffer:
         oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
 
-        __cgh.parallel_for<__kernel_name_t>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) {
+        __cgh.parallel_for<_ForKernel>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) {
             auto __idx = __item_id.get_linear_id();
             __brick(__idx, __rngs...);
         });
@@ -306,12 +276,10 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _
 
     using _Size = decltype(__n);
     using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
-    using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
-    using __kernel_name_t = __parallel_reduce_kernel<_Up, _Cp, _Rp, __kernel_name, _Ranges...>;
-#else
-    using __kernel_name_t = __parallel_reduce_kernel<__kernel_name>;
-#endif
+    using _CustomName = typename _Policy::kernel_name;
+    using _ReduceKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_reduce_kernel, _CustomName, _Up, _Cp,
+                                                                     _Rp, _Ranges...>;
 
     sycl::cl_uint __max_compute_units = oneapi::dpl::__internal::__max_compute_units(__exec);
     ::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
@@ -319,7 +287,7 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _
     __work_group_size = oneapi::dpl::__internal::__max_local_allocation_size<_ExecutionPolicy, _Tp>(
         ::std::forward<_ExecutionPolicy>(__exec), __work_group_size);
 #if _ONEDPL_COMPILE_KERNEL
-    sycl::kernel __kernel = __kernel_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+    sycl::kernel __kernel = _ReduceKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
     __work_group_size = ::std::min(__work_group_size, oneapi::dpl::__internal::__kernel_work_group_size(
                                                           ::std::forward<_ExecutionPolicy>(__exec), __kernel));
 #endif
@@ -354,7 +322,7 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _
             auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh);
             sycl::accessor<_Tp, 1, access_mode::read_write, sycl::access::target::local> __temp_local(
                 sycl::range<1>(__work_group_size), __cgh);
-            __cgh.parallel_for<__kernel_name_t>(
+            __cgh.parallel_for<_ReduceKernel>(
 #if _ONEDPL_COMPILE_KERNEL
                 __kernel,
 #endif
@@ -404,21 +372,21 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
                           _InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan)
 {
     using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
-    using _KernelName = typename _Policy::kernel_name;
+    using _CustomName = typename _Policy::kernel_name;
     using _Type = typename _InitType::__value_type;
 
-#if __SYCL_UNNAMED_LAMBDA__
-    using __kernel_1_name_t = __parallel_scan_kernel_1<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan,
-                                                       _GroupScan, _GlobalScan, _KernelName>;
-    using __kernel_2_name_t = __parallel_scan_kernel_2<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan,
-                                                       _GroupScan, _GlobalScan, _KernelName>;
-    using __kernel_3_name_t = __parallel_scan_kernel_3<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan,
-                                                       _GroupScan, _GlobalScan, _KernelName>;
-#else
-    using __kernel_1_name_t = __parallel_scan_kernel_1<_KernelName>;
-    using __kernel_2_name_t = __parallel_scan_kernel_2<_KernelName>;
-    using __kernel_3_name_t = __parallel_scan_kernel_3<_KernelName>;
-#endif
+    using _LocalScanKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_local_kernel, _CustomName, _Range1,
+                                                                     _Range2, _BinaryOperation, _Type, _LocalScan,
+                                                                     _GroupScan, _GlobalScan>;
+    using _GlobalScanKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_global_kernel, _CustomName,
+                                                                     _Range1, _Range2, _BinaryOperation, _Type,
+                                                                     _LocalScan, _GroupScan, _GlobalScan>;
+    using _PropagateKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_propagate_kernel, _CustomName,
+                                                                     _Range1, _Range2, _BinaryOperation, _Type,
+                                                                     _LocalScan, _GroupScan, _GlobalScan>;
 
     auto __n = __rng1.size();
     assert(__n > 0);
@@ -430,8 +398,8 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
         ::std::forward<_ExecutionPolicy>(__exec), __wgroup_size);
 
 #if _ONEDPL_COMPILE_KERNEL
-    auto __kernel_1 = __kernel_1_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
-    auto __kernel_2 = __kernel_2_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+    auto __kernel_1 = _LocalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+    auto __kernel_2 = _GlobalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
     auto __wgroup_size_kernel_1 =
         oneapi::dpl::__internal::__kernel_work_group_size(::std::forward<_ExecutionPolicy>(__exec), __kernel_1);
     auto __wgroup_size_kernel_2 =
@@ -455,7 +423,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
         sycl::accessor<_Type, 1, access_mode::discard_read_write, sycl::access::target::local> __local_acc(
             __wgroup_size, __cgh);
 
-        __cgh.parallel_for<__kernel_1_name_t>(
+        __cgh.parallel_for<_LocalScanKernel>(
 #if _ONEDPL_COMPILE_KERNEL
             __kernel_1,
 #endif
@@ -475,7 +443,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
             sycl::accessor<_Type, 1, access_mode::discard_read_write, sycl::access::target::local> __local_acc(
                 __wgroup_size, __cgh);
 
-            __cgh.parallel_for<__kernel_2_name_t>(
+            __cgh.parallel_for<_GlobalScanKernel>(
 #if _ONEDPL_COMPILE_KERNEL
                 __kernel_2,
 #endif
@@ -492,7 +460,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
         __cgh.depends_on(__submit_event);
         oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
         auto __wg_sums_acc = __wg_sums.template get_access<access_mode::read>(__cgh);
-        __cgh.parallel_for<__kernel_3_name_t>(sycl::range<1>(__n_groups * __size_per_wg), [=](sycl::item<1> __item) {
+        __cgh.parallel_for<_PropagateKernel>(sycl::range<1>(__n_groups * __size_per_wg), [=](sycl::item<1> __item) {
             __global_scan(__item, __rng2, __rng1, __wg_sums_acc, __n, __size_per_wg);
         });
     });
@@ -648,14 +616,10 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<
 __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag, _Ranges&&... __rngs)
 {
     using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
-    using __kernel_name = typename _Policy::kernel_name;
+    using _CustomName = typename _Policy::kernel_name;
     using _AtomicType = typename _BrickTag::_AtomicType;
-
-#if __SYCL_UNNAMED_LAMBDA__
-    using __kernel_1_name_t = __parallel_find_or_kernel_1<_Ranges..., _Brick, __kernel_name>;
-#else
-    using __kernel_1_name_t = __parallel_find_or_kernel_1<__kernel_name>;
-#endif
+    using _FindOrKernel = oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_find_or_kernel,
+                                                                                       _CustomName, _Brick, _Ranges...>;
 
     auto __or_tag_check = ::std::is_same<_BrickTag, __parallel_or_tag>{};
     auto __rng_n = oneapi::dpl::__ranges::__get_first_range(::std::forward<_Ranges>(__rngs)...).size();
@@ -663,7 +627,7 @@ __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag,
 
     auto __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(::std::forward<_ExecutionPolicy>(__exec));
 #if _ONEDPL_COMPILE_KERNEL
-    auto __kernel = __kernel_1_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+    auto __kernel = _FindOrKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
     __wgroup_size = ::std::min(__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(
                                                   ::std::forward<_ExecutionPolicy>(__exec), __kernel));
 #endif
@@ -693,7 +657,7 @@ __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag,
 
             // create local accessor to connect atomic with
             sycl::accessor<_AtomicType, 1, access_mode::read_write, sycl::access::target::local> __temp_local(1, __cgh);
-            __cgh.parallel_for<__kernel_1_name_t>(
+            __cgh.parallel_for<_FindOrKernel>(
 #if _ONEDPL_COMPILE_KERNEL
                 __kernel,
 #endif
@@ -1036,12 +1000,10 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _
 __parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp)
 {
     using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
-    using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
-    using __kernel_1_name_t = __parallel_merge_kernel<_Range1, _Range2, _Range3, _Compare, __kernel_name>;
-#else
-    using __kernel_1_name_t = __parallel_merge_kernel<__kernel_name>;
-#endif
+    using _CustomName = typename _Policy::kernel_name;
+    using _MergeKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_merge_kernel, _CustomName, _Range1,
+                                                                     _Range2, _Range3, _Compare>;
     auto __n = __rng1.size();
     auto __n_2 = __rng2.size();
 
@@ -1055,7 +1017,7 @@ __parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
 
     auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
         oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3);
-        __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
+        __cgh.parallel_for<_MergeKernel>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
             __full_merge_kernel()(__item_id.get_linear_id() * __chunk, __rng1, decltype(__n)(0), __n, __rng2,
                                   decltype(__n_2)(0), __n_2, __rng3, decltype(__n)(0), __comp, __chunk);
         });
@@ -1095,16 +1057,16 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _
 __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp)
 {
     using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
-    using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
-    using __kernel_1_name_t = __parallel_sort_kernel_1<_Range, _Merge, _Compare, __kernel_name>;
-    using __kernel_2_name_t = __parallel_sort_kernel_2<_Range, _Merge, _Compare, __kernel_name>;
-    using __kernel_3_name_t = __parallel_sort_kernel_3<_Range, _Merge, _Compare, __kernel_name>;
-#else
-    using __kernel_1_name_t = __parallel_sort_kernel_1<__kernel_name>;
-    using __kernel_2_name_t = __parallel_sort_kernel_2<__kernel_name>;
-    using __kernel_3_name_t = __parallel_sort_kernel_3<__kernel_name>;
-#endif
+    using _CustomName = typename _Policy::kernel_name;
+    using _LeafSortKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_leaf_kernel, _CustomName, _Range,
+                                                                     _Merge, _Compare>;
+    using _GlobalSortKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_global_kernel, _CustomName, _Range,
+                                                                     _Merge, _Compare>;
+    using _CopyBackKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_copy_back_kernel, _CustomName,
+                                                                     _Range, _Merge, _Compare>;
 
     using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
     using _Size = oneapi::dpl::__internal::__difference_t<_Range>;
@@ -1133,13 +1095,12 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge,
     // 1. Perform sorting of the leaves of the merge sort tree
     sycl::event __event1 = __exec.queue().submit([&](sycl::handler& __cgh) {
         oneapi::dpl::__ranges::__require_access(__cgh, __rng);
-        __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__leaf_steps),
-                                              [=](sycl::item</*dim=*/1> __item_id) {
-                                                  const _Size __idx = __item_id.get_linear_id() * __leaf;
-                                                  const _Size __start = __idx;
-                                                  const _Size __end = sycl::min(__start + __leaf, __n);
-                                                  __leaf_sort_kernel()(__rng, __start, __end, __comp);
-                                              });
+        __cgh.parallel_for<_LeafSortKernel>(sycl::range</*dim=*/1>(__leaf_steps), [=](sycl::item</*dim=*/1> __item_id) {
+            const _Size __idx = __item_id.get_linear_id() * __leaf;
+            const _Size __start = __idx;
+            const _Size __end = sycl::min(__start + __leaf, __n);
+            __leaf_sort_kernel()(__rng, __start, __end, __comp);
+        });
     });
 
     _Size __sorted = __leaf;
@@ -1183,7 +1144,7 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge,
             __cgh.depends_on(__event1);
             oneapi::dpl::__ranges::__require_access(__cgh, __rng);
             auto __temp_acc = __temp.template get_access<__par_backend_hetero::access_mode::read_write>(__cgh);
-            __cgh.parallel_for<__kernel_2_name_t>(
+            __cgh.parallel_for<_GlobalSortKernel>(
                 sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
                     const _Size __idx = __item_id.get_linear_id();
                     // Borders of the first and the second sorted sequences
@@ -1221,7 +1182,7 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge,
             oneapi::dpl::__ranges::__require_access(__cgh, __rng);
             auto __temp_acc = __temp.template get_access<access_mode::read>(__cgh);
             // We cannot use __cgh.copy here because of zip_iterator usage
-            __cgh.parallel_for<__kernel_3_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
+            __cgh.parallel_for<_CopyBackKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
                 __rng[__item_id.get_linear_id()] = __temp_acc[__item_id];
             });
         });
@@ -1235,14 +1196,13 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _
 __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp)
 {
     using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
-    using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
-    using __kernel_1_name_t = __parallel_sort_kernel_1<_Range, _Merge, _Compare, __kernel_name>;
-    using __kernel_2_name_t = __parallel_sort_kernel_2<_Range, _Merge, _Compare, __kernel_name>;
-#else
-    using __kernel_1_name_t = __parallel_sort_kernel_1<__kernel_name>;
-    using __kernel_2_name_t = __parallel_sort_kernel_2<__kernel_name>;
-#endif
+    using _CustomName = typename _Policy::kernel_name;
+    using _GlobalSortKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_global_kernel, _CustomName, _Range,
+                                                                     _Merge, _Compare>;
+    using _CopyBackKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_copy_back_kernel, _CustomName,
+                                                                     _Range, _Merge, _Compare>;
 
     using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
     using _Size = oneapi::dpl::__internal::__difference_t<_Range>;
@@ -1262,7 +1222,7 @@ __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge _
             __cgh.depends_on(__event1);
             oneapi::dpl::__ranges::__require_access(__cgh, __rng);
             auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh);
-            __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
+            __cgh.parallel_for<_GlobalSortKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
                 auto __global_idx = __item_id.get_linear_id();
 
                 _Size __start = 2 * __k * (__global_idx / (2 * __k));
@@ -1293,7 +1253,7 @@ __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge _
             oneapi::dpl::__ranges::__require_access(__cgh, __rng);
             auto __temp_acc = __temp.template get_access<access_mode::read>(__cgh);
             // we cannot use __cgh.copy here because of zip_iterator usage
-            __cgh.parallel_for<__kernel_2_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
+            __cgh.parallel_for<_CopyBackKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
                 __rng[__item_id.get_linear_id()] = __temp_acc[__item_id];
             });
         });
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
index 4915b1f0a396..13d21dd380b4 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -55,6 +55,9 @@ class __radix_sort_reorder_kernel : public __kernel_name_base<__radix_sort_reord
 {
 };
 
+template <typename _Name>
+class __odd_iteration;
+
 //------------------------------------------------------------------------
 // radix sort: ordered traits for a given size and integral/float flag
 //------------------------------------------------------------------------
@@ -562,30 +565,28 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme
                                 _InRange&& __in_rng, _OutRange&& __out_rng, _TmpBuf&& __tmp_buf,
                                 sycl::event __dependency_event)
 {
-    using _KernelName = typename __decay_t<_ExecutionPolicy>::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
-    using __in_range_t = __decay_t<_InRange>;
-    using __out_range_t = __decay_t<_OutRange>;
-    using __tmp_buf_t = __decay_t<_TmpBuf>;
-    using __count_kernel_name = __radix_sort_count_kernel<__in_range_t, __tmp_buf_t, _KernelName>;
-    using __scan_kernel_name_1 = __radix_sort_scan_kernel_1<__tmp_buf_t, _KernelName>;
-    using __scan_kernel_name_2 = __radix_sort_scan_kernel_2<__tmp_buf_t, _KernelName>;
-    using __reorder_kernel_name = __radix_sort_reorder_kernel<__in_range_t, __out_range_t, ::std::size_t, _KernelName>;
-#else
-    using __count_kernel_name = __radix_sort_count_kernel<_KernelName>;
-    using __scan_kernel_name_1 = __radix_sort_scan_kernel_1<_KernelName>;
-    using __scan_kernel_name_2 = __radix_sort_scan_kernel_2<_KernelName>;
-    using __reorder_kernel_name = __radix_sort_reorder_kernel<_KernelName>;
-#endif
+    using _CustomName = typename __decay_t<_ExecutionPolicy>::kernel_name;
+    using _RadixCountKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_count_kernel, _CustomName,
+                                                                     __decay_t<_InRange>, __decay_t<_TmpBuf>>;
+    using _RadixLocalScanKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_scan_kernel_1, _CustomName,
+                                                                     __decay_t<_TmpBuf>>;
+    using _RadixGlobalScanKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_scan_kernel_2, _CustomName,
+                                                                     __decay_t<_TmpBuf>>;
+    using _RadixReorderKernel =
+        oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_reorder_kernel, _CustomName,
+                                                                     __decay_t<_InRange>, __decay_t<_OutRange>>;
 
     ::std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__exec);
     ::std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
     ::std::size_t __block_size = __max_sg_size;
     ::std::size_t __reorder_sg_size = __max_sg_size;
 #if _ONEDPL_COMPILE_KERNEL
-    auto __count_kernel = __count_kernel_name::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
-    auto __scan_kernel_1 = __scan_kernel_name_1::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
-    auto __reorder_kernel = __reorder_kernel_name::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+    auto __count_kernel = _RadixCountKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+    auto __scan_kernel_1 = _RadixLocalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+    auto __reorder_kernel = _RadixReorderKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
     ::std::size_t __count_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __count_kernel);
     __reorder_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __reorder_kernel);
     __block_size = sycl::max(__count_sg_size, __reorder_sg_size);
@@ -596,7 +597,7 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme
         __block_size = __radix_states;
 
     // 1. Count Phase
-    sycl::event __count_event = __radix_sort_count_submit<__count_kernel_name, __radix_bits, __is_comp_asc>(
+    sycl::event __count_event = __radix_sort_count_submit<_RadixCountKernel, __radix_bits, __is_comp_asc>(
         __exec, __segments, __block_size, __radix_iter, __in_rng, __tmp_buf, __dependency_event
 #if _ONEDPL_COMPILE_KERNEL
         ,
@@ -605,11 +606,11 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme
     );
 
     // 2. Scan Phase
-    sycl::event __scan_event = __radix_sort_scan_submit<__scan_kernel_name_1, __scan_kernel_name_2, __radix_bits>(
+    sycl::event __scan_event = __radix_sort_scan_submit<_RadixLocalScanKernel, _RadixGlobalScanKernel, __radix_bits>(
         __exec, __scan_wg_size, __segments, __tmp_buf, __count_event);
 
     // 3. Reorder Phase
-    sycl::event __reorder_event = __radix_sort_reorder_submit<__reorder_kernel_name, __radix_bits, __is_comp_asc>(
+    sycl::event __reorder_event = __radix_sort_reorder_submit<_RadixReorderKernel, __radix_bits, __is_comp_asc>(
         __exec, __segments, __block_size, __reorder_sg_size, __radix_iter, __in_rng, __out_rng, __tmp_buf, __scan_event
 #if _ONEDPL_COMPILE_KERNEL
         ,
@@ -662,10 +663,12 @@ __parallel_radix_sort(_ExecutionPolicy&& __exec, _Range&& __in_rng)
         // TODO: convert to ordered type once at the first iteration and convert back at the last one
         if (__radix_iter % 2 == 0)
             __iteration_event = __parallel_radix_sort_iteration<__radix_bits, __is_comp_asc>(
-                __exec, __segments, __radix_iter, __in_rng, __out_rng, __tmp_buf, __iteration_event);
-        else //swap __in_rng and__out_rng
+                ::std::forward<_ExecutionPolicy>(__exec), __segments, __radix_iter, __in_rng, __out_rng, __tmp_buf,
+                __iteration_event);
+        else //swap __in_rng and __out_rng
             __iteration_event = __parallel_radix_sort_iteration<__radix_bits, __is_comp_asc>(
-                __exec, __segments, __radix_iter, __out_rng, __in_rng, __tmp_buf, __iteration_event);
+                make_wrapped_policy<__odd_iteration>(::std::forward<_ExecutionPolicy>(__exec)), __segments,
+                __radix_iter, __out_rng, __in_rng, __tmp_buf, __iteration_event);
 
         // TODO: since reassign to __iteration_event does not work, we have to make explicit wait on the event
         explicit_wait_if<::std::is_pointer<decltype(__in_rng.begin())>::value>{}(__iteration_event);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 67892b8a7c1e..ee275c252881 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -82,8 +82,45 @@ struct explicit_wait_if<true>
     }
 };
 
+// function is needed to wrap kernel name into another class
+template <template <typename> class _NewKernelName, typename _Policy,
+          oneapi::dpl::__internal::__enable_if_device_execution_policy<_Policy, int> = 0>
+auto
+make_wrapped_policy(_Policy&& __policy)
+    -> decltype(oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+        ::std::forward<_Policy>(__policy)))
+{
+    return oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+        ::std::forward<_Policy>(__policy));
+}
+
+#if _ONEDPL_FPGA_DEVICE
+template <template <typename> class _NewKernelName, typename _Policy,
+          oneapi::dpl::__internal::__enable_if_fpga_execution_policy<_Policy, int> = 0>
+auto
+make_wrapped_policy(_Policy&& __policy)
+    -> decltype(oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
+                                                         _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+        ::std::forward<_Policy>(__policy)))
+{
+    return oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
+                                                    _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+        ::std::forward<_Policy>(__policy));
+}
+#endif
+
 namespace __internal
 {
+
+template <template <typename...> class _BaseName, typename _CustomName, typename... _Args>
+using _KernelName_t =
+#if __SYCL_UNNAMED_LAMBDA__
+    typename std::conditional<std::is_same<_CustomName, oneapi::dpl::execution::DefaultKernelName>::value,
+                              _BaseName<_CustomName, _Args...>, _BaseName<_CustomName>>::type;
+#else
+    _BaseName<_CustomName>;
+#endif
+
 #if _ONEDPL_DEBUG_SYCL
 template <typename _Policy>
 inline void
diff --git a/test/general/lambda_naming.pass.cpp b/test/general/lambda_naming.pass.cpp
index 2b9b5c0a7d67..b3040432bc56 100644
--- a/test/general/lambda_naming.pass.cpp
+++ b/test/general/lambda_naming.pass.cpp
@@ -30,7 +30,7 @@ using namespace TestUtils;
 
 // This is the simple test for compilation only, to check if lambda naming works correctly
 int main() {
-#if __SYCL_UNNAMED_LAMBDA__ && _ONEDPL_BACKEND_SYCL
+#if _ONEDPL_BACKEND_SYCL
     const int n = 1000;
     sycl::buffer<int, 1> buf{ sycl::range<1>(n) };
     sycl::buffer<int, 1> out_buf{ sycl::range<1>(n) };
@@ -44,10 +44,13 @@ int main() {
             sycl::noinit);
 #else
             sycl::property::noinit{});
-#endif
+#endif // __cplusplus >= 201703L
+
     ::std::fill(policy, buf_begin_discard_write, buf_begin_discard_write + n, 1);
+#if __SYCL_UNNAMED_LAMBDA__
     ::std::sort(policy, buf_begin, buf_end);
     ::std::for_each(policy, buf_begin, buf_end, [](int& x) { x += 41; });
+
 #if !_ONEDPL_FPGA_DEVICE
     ::std::inplace_merge(policy, buf_begin, buf_begin + n / 2, buf_end);
     auto red_val = ::std::reduce(policy, buf_begin, buf_end, 1);
@@ -58,8 +61,15 @@ int main() {
     EXPECT_TRUE(!is_equal, "wrong return value from equal");
     auto does_1_exist = ::std::find(policy, buf_begin, buf_end, 1);
     EXPECT_TRUE(does_1_exist - buf_begin == 1000, "wrong return value from find");
-#endif
-#endif
+#endif // !_ONEDPL_FPGA_DEVICE
+
+#else
+    // ::std::for_each(policy, buf_begin, buf_end, [](int& x) { x++; }); // It's not allowed. Policy with different name is needed
+    ::std::for_each(oneapi::dpl::execution::make_device_policy<class ForEach>(policy), buf_begin, buf_end, [](int& x) { x++; });
+    auto red_val = ::std::reduce(policy, buf_begin, buf_end, 1);
+    EXPECT_TRUE(red_val == 2001, "wrong return value from reduce");
+#endif // __SYCL_UNNAMED_LAMBDA__
+#endif // _ONEDPL_BACKEND_SYCL
     ::std::cout << done() << ::std::endl;
     return 0;
 }
diff --git a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp
index 6384864b1963..f608f6aeb287 100644
--- a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp
+++ b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp
@@ -31,6 +31,9 @@
 #define _PSTL_TEST_SHIFT_RIGHT
 #endif
 
+template<typename Name>
+struct USM;
+
 struct test_shift
 {
     template <typename Policy, typename It, typename Algo>
@@ -86,7 +89,7 @@ struct test_shift
             //copying data to USM buffer
             ::std::copy_n(first, m, ptr.get());
 
-            auto het_res = algo(::std::forward<Policy>(exec), ptr.get(), ptr.get() + m, n);
+            auto het_res = algo(oneapi::dpl::execution::make_device_policy<USM<Algo>>(::std::forward<Policy>(exec)), ptr.get(), ptr.get() + m, n);
             res_idx = het_res - ptr.get();
 
             //3.2 check result
@@ -97,16 +100,6 @@ struct test_shift
 #endif
 };
 
-template <typename T, typename Size, typename Algo>
-void
-test_shift_by_type(Size m, Size n, Algo algo)
-{
-    TestUtils::Sequence<T> orig(m, [](::std::size_t v) -> T { return T(v); }); //fill data
-    TestUtils::Sequence<T> in(m, [](::std::size_t v) -> T { return T(v); }); //fill data
-
-    TestUtils::invoke_on_all_policies<>()(test_shift(), in.begin(), m, orig.begin(), n, algo);
-}
-
 struct shift_left_algo
 {
     template <typename Policy, typename It>
@@ -188,6 +181,21 @@ struct shift_right_algo
     }
 };
 
+template <typename T, typename Size>
+void
+test_shift_by_type(Size m, Size n)
+{
+    TestUtils::Sequence<T> orig(m, [](::std::size_t v) -> T { return T(v); }); //fill data
+    TestUtils::Sequence<T> in(m, [](::std::size_t v) -> T { return T(v); }); //fill data
+
+#ifdef _PSTL_TEST_SHIFT_LEFT
+    TestUtils::invoke_on_all_policies<0>()(test_shift(), in.begin(), m, orig.begin(), n, shift_left_algo{});
+#endif
+#ifdef _PSTL_TEST_SHIFT_RIGHT
+    TestUtils::invoke_on_all_policies<1>()(test_shift(), in.begin(), m, orig.begin(), n, shift_right_algo{});
+#endif
+}
+
 int
 main()
 {
@@ -195,13 +203,7 @@ main()
     for (long m = 0; m < N; m = m < 16 ? m + 1 : long(3.1415 * m))
         for (long n = 0; n < N; n = n < 16 ? n + 1 : long(3.1415 * n))
     {
-       //std::cout << "m: " << m << " n: " << n << std::endl;
-#ifdef _PSTL_TEST_SHIFT_LEFT
-       test_shift_by_type<int32_t>(m, n, shift_left_algo{}); 
-#endif
-#ifdef _PSTL_TEST_SHIFT_RIGHT
-       test_shift_by_type<int32_t>(m, n, shift_right_algo{});
-#endif
+       test_shift_by_type<int32_t>(m, n);
     }
 
     ::std::cout << TestUtils::done() << ::std::endl;
diff --git a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp
index 0a6609e51b54..8317e0bfa910 100644
--- a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp
@@ -103,7 +103,7 @@ void test_with_usm()
     res_head[5] = 9; res_head[6] = 9; res_head[7] = 9; res_head[8] = 9; res_head[9] = 9;
 
     // call algorithm
-    auto new_policy = oneapi::dpl::execution::make_device_policy(q);
+    auto new_policy = oneapi::dpl::execution::make_device_policy<class exclusive_scan_by_segment_1>(q);
     oneapi::dpl::exclusive_scan_by_segment(new_policy, key_head, key_head+n, val_head, res_head,
         (uint64_t)0, std::equal_to<uint64_t>(), std::plus<uint64_t>());
     q.wait();
@@ -120,7 +120,7 @@ void test_with_usm()
 
     // call algorithm on single element range
     res_head[0] = 9;
-    auto new_policy2 = oneapi::dpl::execution::make_device_policy(q);
+    auto new_policy2 = oneapi::dpl::execution::make_device_policy<class exclusive_scan_by_segment_2>(q);
     oneapi::dpl::exclusive_scan_by_segment(new_policy2, key_head, key_head+1, val_head, res_head,
         (uint64_t)0);
 
diff --git a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp
index 0c130857f894..c9ab5a00d8cf 100644
--- a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp
@@ -102,7 +102,7 @@ void test_with_usm()
     res_head[5] = 9; res_head[6] = 9; res_head[7] = 9; res_head[8] = 9; res_head[9] = 9;
 
     // call algorithm
-    auto new_policy = oneapi::dpl::execution::make_device_policy(q);
+    auto new_policy = oneapi::dpl::execution::make_device_policy<class inclusive_scan_by_segment_1>(q);
     oneapi::dpl::inclusive_scan_by_segment(new_policy, key_head, key_head+n, val_head, res_head,
         std::equal_to<uint64_t>(), std::plus<uint64_t>());
 
@@ -118,7 +118,7 @@ void test_with_usm()
 
     // call algorithm on single element range
     res_head[0] = 9;
-    auto new_policy2 = oneapi::dpl::execution::make_device_policy(q);
+    auto new_policy2 = oneapi::dpl::execution::make_device_policy<class inclusive_scan_by_segment_2>(q);
     oneapi::dpl::inclusive_scan_by_segment(new_policy2, key_head, key_head+1, val_head, res_head);
 
     // check values
diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
index 2d125d0a3b7d..96e48a104f37 100644
--- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -152,7 +152,7 @@ void test_with_usm()
     val_head[12] = 0;
 
     // call algorithm
-    auto new_policy = oneapi::dpl::execution::make_device_policy(q);
+    auto new_policy = oneapi::dpl::execution::make_device_policy<class reduce_by_segment_1>(q);
     auto res1 = oneapi::dpl::reduce_by_segment(new_policy, key_head, key_head + n, val_head, key_res_head, val_res_head);
 
     // check values
@@ -179,7 +179,7 @@ void test_with_usm()
     key_res_head[0] = 9;
     val_res_head[0] = 9;
 
-    auto new_policy2 = oneapi::dpl::execution::make_device_policy(q);
+    auto new_policy2 = oneapi::dpl::execution::make_device_policy<class reduce_by_segment_2>(q);
     auto res2 = oneapi::dpl::reduce_by_segment(new_policy2, key_head, key_head + 1, val_head, key_res_head, val_res_head);
 
     // check values
diff --git a/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp b/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp
index 49ce643713d8..76fe8c246cc1 100644
--- a/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp
+++ b/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp
@@ -50,7 +50,7 @@ main()
         auto exec = TestUtils::default_dpcpp_policy;
 
         merge(exec, all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(D));
-        merge(exec, all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(E), ::std::less<T>());
+        merge(oneapi::dpl::execution::make_device_policy<class merge_2>(exec), all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(E), ::std::less<T>());
     }
 
     //check result
author	Andrey Fedorov <andrey.fedorov@intel.com>	2021-02-16 12:02:37 +0300
committer	GitHub <noreply@github.com>	2021-02-16 12:02:37 +0300
commit	94189435085a09fcca528f6313b0006d179bbf00 (patch)
tree	e31c81155879cdb7bf15a44217098202c7957a02
parent	Add FPGA_EMU tests (#118) (diff)
download	llvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.gz llvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.bz2 llvm-project-94189435085a09fcca528f6313b0006d179bbf00.zip