docs/html/parallel_8h_source.html

// Copyright 2022 The Manifold Authors.

//

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

//

//      http://www.apache.org/licenses/LICENSE-2.0

//

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//

// Simple implementation of selected functions in PSTL.

// Iterators must be RandomAccessIterator.


#pragma once


#if (MANIFOLD_PAR == 1)

#include <tbb/combinable.h>

#include <tbb/parallel_for.h>

#include <tbb/parallel_invoke.h>

#include <tbb/parallel_reduce.h>

#include <tbb/parallel_scan.h>

#endif

#include <algorithm>
#include <memory>
#include <numeric>

#include "manifold/iters.h"

namespace manifold {


// Selects which code path the algorithms in this header take.
enum class ExecutionPolicy {

  // Take the parallel code path (only compiled in when MANIFOLD_PAR == 1).
  Par,

  // Take the sequential standard-library code path.
  Seq,

};


// Default workload size up to which (inclusive) work stays sequential.
constexpr size_t kSeqThreshold = 10000;

// Choose an ExecutionPolicy from the workload size:
// - Sequential when the size is at most `threshold`,
// - Parallel (CPU) when the size exceeds `threshold`.

inline constexpr ExecutionPolicy autoPolicy(size_t size,
                                            size_t threshold = kSeqThreshold) {
  // Only workloads strictly larger than `threshold` go parallel.
  return size > threshold ? ExecutionPolicy::Par : ExecutionPolicy::Seq;
}


template <typename Iter,
          typename Dummy = std::enable_if_t<!std::is_integral_v<Iter>>>
inline constexpr ExecutionPolicy autoPolicy(Iter first, Iter last,
                                            size_t threshold = kSeqThreshold) {
  // Iterator overload: the workload is the length of `[first, last)`. The
  // enable_if keeps integral arguments on the size-based overload.
  const size_t count = static_cast<size_t>(std::distance(first, last));
  return count > threshold ? ExecutionPolicy::Par : ExecutionPolicy::Seq;
}


// Forward declarations of `copy`. The definitions appear later in this header,
// but helpers in `namespace details` already call `copy` before that point.
template <typename InputIter, typename OutputIter>

void copy(ExecutionPolicy policy, InputIter first, InputIter last,

          OutputIter d_first);

template <typename InputIter, typename OutputIter>

void copy(InputIter first, InputIter last, OutputIter d_first);


#if (MANIFOLD_PAR == 1)

namespace details {

using manifold::kSeqThreshold;

// implementation from

// https://duvanenko.tech.blog/2018/01/14/parallel-merge/

// https://github.com/DragonSpit/ParallelAlgorithms

// note that the ranges are now [p, r) to fit our convention.

// Merges the ranges `[p1, r1)` and `[p2, r2)` of `src` (each expected to be
// sorted by `comp`) into `dest`, writing from index `p3` onwards. Inputs with
// more than kSeqThreshold elements in total are split in two and the two
// halves are merged in parallel.
template <typename SrcIter, typename DestIter, typename Comp>

void mergeRec(SrcIter src, DestIter dest, size_t p1, size_t r1, size_t p2,

              size_t r2, size_t p3, Comp comp) {

  size_t length1 = r1 - p1;

  size_t length2 = r2 - p2;

  // Make range 1 the longer of the two; the split point is taken from it.
  if (length1 < length2) {

    std::swap(p1, p2);

    std::swap(r1, r2);

    std::swap(length1, length2);

  }

  // Range 1 is the longer one, so both ranges are empty here.
  if (length1 == 0) return;

  if (length1 + length2 <= kSeqThreshold) {

    // Small enough: merge sequentially.
    std::merge(src + p1, src + r1, src + p2, src + r2, dest + p3, comp);

  } else {

    // q1: middle element of the longer range.
    size_t q1 = p1 + length1 / 2;

    // q2: first index in range 2 whose element is not ordered before src[q1].
    size_t q2 =

        std::distance(src, std::lower_bound(src + p2, src + r2, src[q1], comp));

    // q3: output index of src[q1]; the elements of [p1, q1) and [p2, q2) are
    // written before it.
    size_t q3 = p3 + (q1 - p1) + (q2 - p2);

    dest[q3] = src[q1];

    // Merge what lies before and after the pivot as two independent tasks.
    tbb::parallel_invoke(

        [=] { mergeRec(src, dest, p1, q1, p2, q2, p3, comp); },

        [=] { mergeRec(src, dest, q1 + 1, r1, q2, r2, q3 + 1, comp); });

  }

}


// Sorts `[begin, end)` so that the sorted result ends up in `dest`. The two
// buffers swap roles at every recursion level and the leaf case reads from
// `src`, so the range is expected to hold the same data in both buffers on
// entry (mergeSort copies the input into the scratch buffer before calling).
template <typename SrcIter, typename DestIter, typename Comp>

void mergeSortRec(SrcIter src, DestIter dest, size_t begin, size_t end,

                  Comp comp) {

  size_t numElements = end - begin;

  if (numElements <= kSeqThreshold) {

    // Leaf: copy into the destination buffer and sort it there.
    std::copy(src + begin, src + end, dest + begin);

    std::stable_sort(dest + begin, dest + end, comp);

  } else {

    size_t middle = begin + numElements / 2;

    // Sort both halves with the buffers swapped, so the sorted halves land in
    // `src`.
    tbb::parallel_invoke([=] { mergeSortRec(dest, src, begin, middle, comp); },

                         [=] { mergeSortRec(dest, src, middle, end, comp); });

    // Merge the two sorted halves from `src` into `dest`.
    mergeRec(src, dest, begin, middle, middle, end, begin, comp);

  }

}


// Body object passed to `tbb::parallel_scan` by `exclusive_scan`: writes the
// exclusive scan of `input` under the binary operation `f` into `output`.
template <typename T, typename InputIter, typename OutputIter, typename BinOp>
struct ScanBody {
  T sum;       // value carried into, and out of, the processed range
  T identity;  // seed for bodies created by splitting
  BinOp &f;
  InputIter input;
  OutputIter output;

  ScanBody(T sum, T identity, BinOp &f, InputIter input, OutputIter output)
      : sum(sum), identity(identity), f(f), input(input), output(output) {}

  // Splitting constructor: the new body starts from the identity element.
  ScanBody(ScanBody &other, tbb::split)
      : sum(other.identity),
        identity(other.identity),
        f(other.f),
        input(other.input),
        output(other.output) {}

  template <typename Tag>
  void operator()(const tbb::blocked_range<size_t> &r, Tag) {
    T running = sum;
    const size_t stop = r.end();
    for (size_t idx = r.begin(); idx < stop; ++idx) {
      // Read the input before writing, so `output` may alias `input`.
      T current = input[idx];
      if (Tag::is_final_scan()) output[idx] = running;
      running = f(running, current);
    }
    sum = running;
  }

  T get_sum() const { return sum; }

  // `left` covers the range immediately before this body's range.
  void reverse_join(ScanBody &left) { sum = f(left.sum, sum); }

  void assign(ScanBody &other) { sum = other.sum; }
};


// Body object passed to `tbb::parallel_scan` by `copy_if`. `sum` counts the
// indices accepted by `pred`; during the final scan every accepted element is
// written to `output` at the position given by that running count.
template <typename InputIter, typename OutputIter, typename P>
struct CopyIfScanBody {
  size_t sum;
  P &pred;  // called with the element index, not the element itself
  InputIter input;
  OutputIter output;

  CopyIfScanBody(P &pred, InputIter input, OutputIter output)
      : sum(0), pred(pred), input(input), output(output) {}

  // Splitting constructor: the new body starts counting from zero.
  CopyIfScanBody(CopyIfScanBody &other, tbb::split)
      : sum(0), pred(other.pred), input(other.input), output(other.output) {}

  template <typename Tag>
  void operator()(const tbb::blocked_range<size_t> &r, Tag) {
    size_t kept = sum;
    const size_t stop = r.end();
    for (size_t idx = r.begin(); idx < stop; ++idx) {
      if (!pred(idx)) continue;
      // `kept` is the number of accepted elements before `idx`, i.e. the
      // output slot of this element.
      if (Tag::is_final_scan()) output[kept] = input[idx];
      ++kept;
    }
    sum = kept;
  }

  size_t get_sum() const { return sum; }

  void reverse_join(CopyIfScanBody &left) { sum = left.sum + sum; }

  void assign(CopyIfScanBody &other) { sum = other.sum; }
};


// `K` histograms of 256 bins each, one per byte position of the sorted key:
// `hist[i][b]` is the count for byte value `b` at byte position `i`, or,
// after `prefixSum`, the number of elements whose byte at position `i` is
// smaller than `b`.
template <typename N, const int K>
struct Hist {
  using SizeType = N;
  static constexpr int k = K;
  N hist[k][256] = {{0}};

  // Adds every bin of `other` to the matching bin of this histogram.
  void merge(const Hist<N, K> &other) {
    for (int byte = 0; byte < k; ++byte) {
      N *mine = hist[byte];
      const N *theirs = other.hist[byte];
      for (int bin = 0; bin < 256; ++bin) mine[bin] += theirs[bin];
    }
  }

  // Replaces each row of counts by its exclusive prefix sum. `canSkip[i]` is
  // set to true when one bin of row `i` holds all `total` elements; entries
  // are never reset here, so the caller has to initialize them to false.
  void prefixSum(N total, bool *canSkip) {
    for (int byte = 0; byte < k; ++byte) {
      N *row = hist[byte];
      size_t running = 0;
      for (int bin = 0; bin < 256; ++bin) {
        const N binCount = row[bin];
        row[bin] = running;
        running += binCount;
        if (binCount == total) canSkip[byte] = true;
      }
    }
  }
};


template <typename T, typename H>

void histogram(T *ptr, typename H::SizeType n, H &hist) {

  auto worker = [](T *ptr, typename H::SizeType n, H &hist) {

    for (typename H::SizeType i = 0; i < n; ++i)

      for (int k = 0; k < hist.k; ++k)

        ++hist.hist[k][(ptr[i] >> (8 * k)) & 0xFF];

  };

  if (n < kSeqThreshold) {

    worker(ptr, n, hist);

  } else {

    tbb::combinable<H> store;

    tbb::parallel_for(

        tbb::blocked_range<typename H::SizeType>(0, n, kSeqThreshold),

        [&worker, &store, ptr](const auto &r) {

          worker(ptr + r.begin(), r.end() - r.begin(), store.local());

        });

    store.combine_each([&hist](const H &h) { hist.merge(h); });

  }

}


// One radix-sort pass over byte position `k`: every element of `src[0, n)` is
// written to `target` at the offset stored in its bin of `hist.hist[k]`, and
// that offset is then advanced by one.
template <typename T, typename H>
void shuffle(T *src, T *target, typename H::SizeType n, H &hist, int k) {
  auto &offsets = hist.hist[k];
  const int shift = 8 * k;
  for (typename H::SizeType idx = 0; idx < n; ++idx) {
    const T value = src[idx];
    const auto bin = (value >> shift) & 0xFF;
    target[offsets[bin]++] = value;
  }
}


// Sorts the `n` elements at `input` byte by byte, least significant byte
// first, using `tmp` as the second buffer. Returns true when the sorted data
// ends up in `tmp`, and false when it is in `input`.
template <typename T, typename SizeType>
bool LSB_radix_sort(T *input, T *tmp, SizeType n) {
  Hist<SizeType, sizeof(T) / sizeof(char)> hist;
  // Already sorted: nothing to do, and the data stays in `input`.
  if (std::is_sorted(input, input + n)) return false;

  histogram(input, n, hist);
  bool canSkip[hist.k] = {0};
  hist.prefixSum(n, canSkip);

  // Alternate between the two buffers, one byte position per pass. Positions
  // at which every element has the same byte value are skipped.
  T *from = input;
  T *to = tmp;
  for (int pass = 0; pass < hist.k; ++pass) {
    if (canSkip[pass]) continue;
    shuffle(from, to, n, hist, pass);
    std::swap(from, to);
  }
  // After the last swap `from` is the buffer that holds the result.
  return from == tmp;
}


// LSB radix sort with merge

// Reduction body for `tbb::parallel_reduce` (see `radix_sort`). It describes a
// sorted run `[offset, offset + length)` that is stored either in `input` or,
// when `inTmp` is true, in `tmp`.
template <typename T, typename SizeType>

struct SortedRange {

  T *input, *tmp;

  SizeType offset = 0, length = 0;

  bool inTmp = false;


  SortedRange(T *input, T *tmp, SizeType offset = 0, SizeType length = 0)

      : input(input), tmp(tmp), offset(offset), length(length) {}

  // Splitting constructor: shares the buffers but starts with an empty run.
  SortedRange(SortedRange<T, SizeType> &r, tbb::split)

      : input(r.input), tmp(r.tmp) {}

  // Radix-sorts the block `range`, then adopts it if this body holds no run
  // yet, or joins it onto the current run otherwise.
  void operator()(const tbb::blocked_range<SizeType> &range) {

    SortedRange<T, SizeType> rhs(input, tmp, range.begin(),

                                 range.end() - range.begin());

    rhs.inTmp =

        LSB_radix_sort(input + rhs.offset, tmp + rhs.offset, rhs.length);

    if (length == 0)

      *this = rhs;

    else

      join(rhs);

  }

  // Copies this run into the buffer it is currently not stored in and returns
  // the `inTmp` value matching that new location. Members are not modified.
  bool swapBuffer() const {

    T *src = input, *target = tmp;

    if (inTmp) std::swap(src, target);

    copy(src + offset, src + offset + length, target + offset);

    return !inTmp;

  }

  // Appends the run `rhs` to this run.
  // NOTE(review): assumes `rhs` starts exactly where this run ends, which is
  // presumably guaranteed by how tbb::parallel_reduce splits and joins
  // ranges; confirm.
  void join(const SortedRange<T, SizeType> &rhs) {

    // Bring both runs into the same buffer by copying the shorter one.
    if (inTmp != rhs.inTmp) {

      if (length < rhs.length)

        inTmp = swapBuffer();

      else

        rhs.swapBuffer();

    }

    T *src = input, *target = tmp;

    if (inTmp) std::swap(src, target);

    // A merge is only needed when the last element of this run is greater
    // than the first element of `rhs`; otherwise the runs are already in
    // order where they are.
    if (src[offset + length - 1] > src[rhs.offset]) {

      mergeRec(src, target, offset, offset + length, rhs.offset,

               rhs.offset + rhs.length, offset, std::less<T>());

      // The merged data was written to the other buffer.
      inTmp = !inTmp;

    }

    length += rhs.length;

  }

};


// Sorts `input[0, n)` in ascending order: blocks of the input are radix-sorted
// in parallel and merged by `SortedRange`, using a scratch buffer of `n`
// elements.
template <typename T, typename SizeTy>
void radix_sort(T *input, SizeTy n) {
  // The scratch buffer is owned by a unique_ptr so that it is released even if
  // the parallel section exits with an exception. `new T[n]` is used instead
  // of std::make_unique to keep the elements default-initialized.
  std::unique_ptr<T[]> aux(new T[n]);
  SizeTy blockSize = std::max(n / tbb::this_task_arena::max_concurrency() / 4,
                              static_cast<SizeTy>(kSeqThreshold / sizeof(T)));
  SortedRange<T, SizeTy> result(input, aux.get());
  tbb::parallel_reduce(tbb::blocked_range<SizeTy>(0, n, blockSize), result);
  // The reduction may leave the sorted data in the scratch buffer.
  if (result.inTmp) copy(aux.get(), aux.get() + n, input);
}


// Stable sort of `[first, last)` by `comp`. With the parallel policy (and
// MANIFOLD_PAR == 1) a parallel merge sort with a scratch buffer is used;
// otherwise this is `std::stable_sort`.
template <typename Iterator,
          typename T = typename std::iterator_traits<Iterator>::value_type,
          typename Comp = decltype(std::less<T>())>
void mergeSort(ExecutionPolicy policy, Iterator first, Iterator last,
               Comp comp) {
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    // Run the nested parallel work in isolation, so that the calling thread,
    // while it waits, only executes tasks spawned inside this region.
    tbb::this_task_arena::isolate([&] {
      size_t length = std::distance(first, last);
      // Owned by a unique_ptr so the scratch buffer is released even when
      // `comp` (or the copy) throws.
      std::unique_ptr<T[]> tmp(new T[length]);
      // mergeSortRec expects both buffers to hold the data on entry.
      copy(policy, first, last, tmp.get());
      details::mergeSortRec(tmp.get(), first, 0, length, comp);
    });
    return;
  }
#endif
  std::stable_sort(first, last, comp);
}


// stable_sort using merge sort.

//

// To keep the implementation simple, types that are not trivially
// destructible are not supported.

template <typename Iterator,
          typename T = typename std::iterator_traits<Iterator>::value_type,
          typename Dummy = void>
struct SortFunctor {
  // Generic case: sort `[first, last)` in ascending order with mergeSort.
  void operator()(ExecutionPolicy policy, Iterator first, Iterator last) {
    using Category = typename std::iterator_traits<Iterator>::iterator_category;
    static_assert(
        std::is_convertible_v<Category, std::random_access_iterator_tag>,
        "You can only parallelize RandomAccessIterator.");
    static_assert(std::is_trivially_destructible_v<T>,
                  "Our simple implementation does not support types that are "
                  "not trivially destructable.");
    mergeSort(policy, first, last, std::less<T>());
  }
};


// stable_sort specialized with radix sort for integral types.

// Typically faster than merge sort.

template <typename Iterator, typename T>
struct SortFunctor<
    Iterator, T,
    std::enable_if_t<
        std::is_integral_v<T> &&
        std::is_pointer_v<typename std::iterator_traits<Iterator>::pointer>>> {
  // Sorts `[first, last)` in ascending order. With the parallel policy,
  // unsigned types are radix-sorted and signed types are merge-sorted.
  void operator()(ExecutionPolicy policy, Iterator first, Iterator last) {
    static_assert(
        std::is_convertible_v<
            typename std::iterator_traits<Iterator>::iterator_category,
            std::random_access_iterator_tag>,
        "You can only parallelize RandomAccessIterator.");
    static_assert(std::is_trivially_destructible_v<T>,
                  "Our simple implementation does not support types that are "
                  "not trivially destructable.");
#if (MANIFOLD_PAR == 1)
    if (policy == ExecutionPolicy::Par) {
      // `&*first` below must not dereference the end of an empty range.
      if (first == last) return;
      if constexpr (std::is_unsigned_v<T>) {
        radix_sort(&*first, static_cast<size_t>(std::distance(first, last)));
      } else {
        // The radix sort buckets elements by `(x >> 8 * k) & 0xFF`, i.e. by
        // their bytes read as unsigned values, which would place negative
        // values after the positive ones. Signed types therefore use the
        // comparison-based merge sort.
        mergeSort(policy, first, last, std::less<T>());
      }
      return;
    }
#endif
    stable_sort(policy, first, last, std::less<T>());
  }
};


}  // namespace details


#endif


// Applies the function `f` to each element in the range `[first, last)`

template <typename Iter, typename F>
void for_each(ExecutionPolicy policy, Iter first, Iter last, F f) {
  using Category = typename std::iterator_traits<Iter>::iterator_category;
  static_assert(
      std::is_convertible_v<Category, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    using Range = tbb::blocked_range<Iter>;
    // All chunks call the same functor object `f` (captured by reference).
    tbb::parallel_for(Range(first, last), [&f](const Range &chunk) {
      for (Iter it = chunk.begin(), stop = chunk.end(); it != stop; ++it) {
        f(*it);
      }
    });
    return;
  }
#endif
  std::for_each(first, last, f);
}


// Applies the function `f` to each element in the range `[first, first + n)`

template <typename Iter, typename F>
void for_each_n(ExecutionPolicy policy, Iter first, size_t n, F f) {
  using Category = typename std::iterator_traits<Iter>::iterator_category;
  static_assert(
      std::is_convertible_v<Category, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
  // Delegate to the iterator-pair overload.
  const Iter last = first + n;
  for_each(policy, first, last, f);
}


// Reduce the range `[first, last)` using a binary operation `f` with an initial

// value `init`.

//

// The binary operation should be commutative and associative. Otherwise, the

// result is non-deterministic.

template <typename InputIter, typename BinaryOp,
          typename T = typename std::iterator_traits<InputIter>::value_type>
T reduce(ExecutionPolicy policy, InputIter first, InputIter last, T init,
         BinaryOp f) {
  using Category = typename std::iterator_traits<InputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<Category, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    using Range = tbb::blocked_range<InputIter>;
    // Each chunk (grain size kSeqThreshold) is folded sequentially, starting
    // from the partial value handed in; partial results are combined by `f`.
    auto foldChunk = [&f](const Range &chunk, T partial) {
      return std::reduce(chunk.begin(), chunk.end(), partial, f);
    };
    // should we use deterministic reduce here?
    return tbb::parallel_reduce(Range(first, last, details::kSeqThreshold),
                                init, foldChunk, f);
  }
#endif
  return std::reduce(first, last, init, f);
}


// Reduce the range `[first, last)` using a binary operation `f` with an initial

// value `init`.

//

// The binary operation should be commutative and associative. Otherwise, the

// result is non-deterministic.

template <typename InputIter, typename BinaryOp,

          typename T = typename std::iterator_traits<InputIter>::value_type>

T reduce(InputIter first, InputIter last, T init, BinaryOp f) {

  return reduce(autoPolicy(first, last, 1e5), first, last, init, f);

}


// Transform and reduce the range `[first, last)` by first applying a unary

// function `g`, and then combining the results using a binary operation `f`

// with an initial value `init`.

//

// The binary operation should be commutative and associative. Otherwise, the

// result is non-deterministic.

template <typename InputIter, typename BinaryOp, typename UnaryOp,
          typename T = std::invoke_result_t<
              UnaryOp, typename std::iterator_traits<InputIter>::value_type>>
T transform_reduce(ExecutionPolicy policy, InputIter first, InputIter last,
                   T init, BinaryOp f, UnaryOp g) {
  // Wrap both ends of the range together with `g` in a TransformIterator and
  // reduce the wrapped range.
  auto mappedFirst = TransformIterator(first, g);
  auto mappedLast = TransformIterator(last, g);
  return reduce(policy, mappedFirst, mappedLast, init, f);
}


// Transform and reduce the range `[first, last)` by first applying a unary

// function `g`, and then combining the results using a binary operation `f`

// with an initial value `init`.

//

// The binary operation should be commutative and associative. Otherwise, the

// result is non-deterministic.

template <typename InputIter, typename BinaryOp, typename UnaryOp,
          typename T = std::invoke_result_t<
              UnaryOp, typename std::iterator_traits<InputIter>::value_type>>
T transform_reduce(InputIter first, InputIter last, T init, BinaryOp f,
                   UnaryOp g) {
  // Same as the policy overload, but the policy is picked by the
  // policy-less `reduce` from the range length.
  auto mappedFirst = TransformIterator(first, g);
  auto mappedLast = TransformIterator(last, g);
  return manifold::reduce(mappedFirst, mappedLast, init, f);
}


// Compute the inclusive prefix sum for the range `[first, last)`

// using the summation operator, and store the result in the range

// starting from `d_first`.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must be equal or non-overlapping.

template <typename InputIter, typename OutputIter,
          typename T = typename std::iterator_traits<InputIter>::value_type>
void inclusive_scan(ExecutionPolicy policy, InputIter first, InputIter last,
                    OutputIter d_first) {
  using InCategory =
      typename std::iterator_traits<InputIter>::iterator_category;
  using OutCategory =
      typename std::iterator_traits<OutputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<InCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
  static_assert(
      std::is_convertible_v<OutCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    using Range = tbb::blocked_range<size_t>;
    // Sums one chunk starting from `carry`; output is only written during the
    // final scan.
    auto scanChunk = [&](const Range &chunk, T carry, bool is_final_scan) {
      T running = carry;
      for (size_t idx = chunk.begin(); idx < chunk.end(); ++idx) {
        running = running + first[idx];
        if (is_final_scan) d_first[idx] = running;
      }
      return running;
    };
    tbb::parallel_scan(Range(0, std::distance(first, last)),
                       static_cast<T>(0), scanChunk, std::plus<T>());
    return;
  }
#endif
  std::inclusive_scan(first, last, d_first);
}


// Compute the inclusive prefix sum for the range `[first, last)` using the

// summation operator, and store the result in the range

// starting from `d_first`.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must be equal or non-overlapping.

template <typename InputIter, typename OutputIter,

          typename T = typename std::iterator_traits<InputIter>::value_type>

void inclusive_scan(InputIter first, InputIter last, OutputIter d_first) {

  return inclusive_scan(autoPolicy(first, last, 1e5), first, last, d_first);

}


// Compute the exclusive prefix sum for the range `[first, last)` using the

// binary operator `f`, with initial value `init` and

// identity element `identity`, and store the result in the range

// starting from `d_first`.

//

// This is different from `exclusive_scan` in the sequential algorithm by

// requiring an identity element. This is needed so that each block can be

// scanned in parallel and combined later.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must be equal or non-overlapping.

template <typename InputIter, typename OutputIter,
          typename BinOp = decltype(std::plus<typename std::iterator_traits<
                                        InputIter>::value_type>()),
          typename T = typename std::iterator_traits<InputIter>::value_type>
void exclusive_scan(ExecutionPolicy policy, InputIter first, InputIter last,
                    OutputIter d_first, T init = static_cast<T>(0),
                    BinOp f = std::plus<T>(), T identity = static_cast<T>(0)) {
  using InCategory =
      typename std::iterator_traits<InputIter>::iterator_category;
  using OutCategory =
      typename std::iterator_traits<OutputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<InCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
  static_assert(
      std::is_convertible_v<OutCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    // The scan body starts from `init`; bodies created by splitting start from
    // `identity`.
    using Body = details::ScanBody<T, InputIter, OutputIter, BinOp>;
    Body scanBody(init, identity, f, first, d_first);
    const tbb::blocked_range<size_t> whole(0, std::distance(first, last));
    tbb::parallel_scan(whole, scanBody);
    return;
  }
#endif
  std::exclusive_scan(first, last, d_first, init, f);
}


// Compute the exclusive prefix sum for the range `[first, last)` using the

// binary operator `f`, with initial value `init` and

// identity element `identity`, and store the result in the range

// starting from `d_first`.

//

// This is different from `exclusive_scan` in the sequential algorithm by

// requiring an identity element. This is needed so that each block can be

// scanned in parallel and combined later.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must be equal or non-overlapping.

template <typename InputIter, typename OutputIter,

          typename BinOp = decltype(std::plus<typename std::iterator_traits<

                                        InputIter>::value_type>()),

          typename T = typename std::iterator_traits<InputIter>::value_type>

void exclusive_scan(InputIter first, InputIter last, OutputIter d_first,

                    T init = static_cast<T>(0), BinOp f = std::plus<T>(),

                    T identity = static_cast<T>(0)) {

  exclusive_scan(autoPolicy(first, last, 1e5), first, last, d_first, init, f,

                 identity);

}


// Apply function `f` on the input range `[first, last)` and store the result in

// the range starting from `d_first`.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must be equal or non-overlapping.

template <typename InputIter, typename OutputIter, typename F>
void transform(ExecutionPolicy policy, InputIter first, InputIter last,
               OutputIter d_first, F f) {
  using InCategory =
      typename std::iterator_traits<InputIter>::iterator_category;
  using OutCategory =
      typename std::iterator_traits<OutputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<InCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
  static_assert(
      std::is_convertible_v<OutCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    using Range = tbb::blocked_range<size_t>;
    const size_t count = static_cast<size_t>(std::distance(first, last));
    // Each chunk of indices is transformed independently into the matching
    // part of the output.
    tbb::parallel_for(Range(0, count), [&](const Range &chunk) {
      std::transform(first + chunk.begin(), first + chunk.end(),
                     d_first + chunk.begin(), f);
    });
    return;
  }
#endif
  std::transform(first, last, d_first, f);
}


// Apply function `f` on the input range `[first, last)` and store the result in

// the range starting from `d_first`.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must be equal or non-overlapping.

template <typename InputIter, typename OutputIter, typename F>

void transform(InputIter first, InputIter last, OutputIter d_first, F f) {

  transform(autoPolicy(first, last, 1e5), first, last, d_first, f);

}


// Copy the input range `[first, last)` to the output range

// starting from `d_first`.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must not overlap.

template <typename InputIter, typename OutputIter>
void copy(ExecutionPolicy policy, InputIter first, InputIter last,
          OutputIter d_first) {
  using InCategory =
      typename std::iterator_traits<InputIter>::iterator_category;
  using OutCategory =
      typename std::iterator_traits<OutputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<InCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
  static_assert(
      std::is_convertible_v<OutCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    using Range = tbb::blocked_range<size_t>;
    const size_t count = static_cast<size_t>(std::distance(first, last));
    // Copy in chunks, using kSeqThreshold as the grain size.
    tbb::parallel_for(Range(0, count, details::kSeqThreshold),
                      [&](const Range &chunk) {
                        std::copy(first + chunk.begin(), first + chunk.end(),
                                  d_first + chunk.begin());
                      });
    return;
  }
#endif
  std::copy(first, last, d_first);
}


// Copy the input range `[first, last)` to the output range

// starting from `d_first`.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must not overlap.

template <typename InputIter, typename OutputIter>

void copy(InputIter first, InputIter last, OutputIter d_first) {

  copy(autoPolicy(first, last, 1e6), first, last, d_first);

}


// Copy the input range `[first, first + n)` to the output range

// starting from `d_first`.

//

// The input range `[first, first + n)` and

// the output range `[d_first, d_first + n)`

// must not overlap.

template <typename InputIter, typename OutputIter>
void copy_n(ExecutionPolicy policy, InputIter first, size_t n,
            OutputIter d_first) {
  // Delegate to the iterator-pair overload of `copy`.
  const InputIter last = first + n;
  copy(policy, first, last, d_first);
}


// Copy the input range `[first, first + n)` to the output range

// starting from `d_first`.

//

// The input range `[first, first + n)` and

// the output range `[d_first, d_first + n)`

// must not overlap.

template <typename InputIter, typename OutputIter>

void copy_n(InputIter first, size_t n, OutputIter d_first) {

  copy(autoPolicy(n, 1e6), first, first + n, d_first);

}


// Fill the range `[first, last)` with `value`.

template <typename OutputIter, typename T>
void fill(ExecutionPolicy policy, OutputIter first, OutputIter last, T value) {
  using Category =
      typename std::iterator_traits<OutputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<Category, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    using Range = tbb::blocked_range<OutputIter>;
    // Every chunk of the range is filled independently.
    tbb::parallel_for(Range(first, last), [&](const Range &chunk) {
      std::fill(chunk.begin(), chunk.end(), value);
    });
    return;
  }
#endif
  std::fill(first, last, value);
}


// Fill the range `[first, last)` with `value`.

template <typename OutputIter, typename T>

void fill(OutputIter first, OutputIter last, T value) {

  fill(autoPolicy(first, last, 5e5), first, last, value);

}


// Count the number of elements in the input range `[first, last)` satisfying

// predicate `pred`, i.e. `pred(x) == true`.

template <typename InputIter, typename P>
size_t count_if(ExecutionPolicy policy, InputIter first, InputIter last,
                P pred) {
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    // Sum the predicate results over the range. The initial value has to be a
    // size_t: `reduce` deduces its accumulator type from it, and a plain `0`
    // would make every partial count an int, narrowing the size_t results of
    // std::plus<size_t>.
    return reduce(policy, TransformIterator(first, pred),
                  TransformIterator(last, pred), static_cast<size_t>(0),
                  std::plus<size_t>());
  }
#endif
  return std::count_if(first, last, pred);
}


// Count the number of elements in the input range `[first, last)` satisfying

// predicate `pred`, i.e. `pred(x) == true`.

template <typename InputIter, typename P>

size_t count_if(InputIter first, InputIter last, P pred) {

  return count_if(autoPolicy(first, last, 1e4), first, last, pred);

}


// Check if all elements in the input range `[first, last)` satisfy

// predicate `pred`, i.e. `pred(x) == true`.

template <typename InputIter, typename P>
bool all_of(ExecutionPolicy policy, InputIter first, InputIter last, P pred) {
  using Category = typename std::iterator_traits<InputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<Category, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    using Range = tbb::blocked_range<InputIter>;
    // A chunk is only examined while the partial result is still true, and
    // checking stops at the first element that fails `pred`.
    auto checkChunk = [&](const Range &chunk, bool soFar) {
      if (!soFar) return false;
      for (InputIter it = chunk.begin(), stop = chunk.end(); it != stop; ++it) {
        if (!pred(*it)) return false;
      }
      return true;
    };
    auto bothTrue = [](bool lhs, bool rhs) { return lhs && rhs; };
    // should we use deterministic reduce here?
    return tbb::parallel_reduce(Range(first, last), true, checkChunk,
                                bothTrue);
  }
#endif
  return std::all_of(first, last, pred);
}


// Check if all elements in the input range `[first, last)` satisfy

// predicate `pred`, i.e. `pred(x) == true`.

template <typename InputIter, typename P>

bool all_of(InputIter first, InputIter last, P pred) {

  return all_of(autoPolicy(first, last, 1e5), first, last, pred);

}


// Copy values in the input range `[first, last)` to the output range

// starting from `d_first` that satisfies the predicate `pred`,

// i.e. `pred(x) == true`, and returns `d_first + n` where `n` is the number of

// times the predicate is evaluated to true.

//

// This function is stable, meaning that the relative order of elements in the

// output range remains unchanged.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must not overlap.

template <typename InputIter, typename OutputIter, typename P>
OutputIter copy_if(ExecutionPolicy policy, InputIter first, InputIter last,
                   OutputIter d_first, P pred) {
  using InCategory =
      typename std::iterator_traits<InputIter>::iterator_category;
  using OutCategory =
      typename std::iterator_traits<OutputIter>::iterator_category;
  static_assert(
      std::is_convertible_v<InCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
  static_assert(
      std::is_convertible_v<OutCategory, std::random_access_iterator_tag>,
      "You can only parallelize RandomAccessIterator.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    // The scan body works on indices, so adapt `pred` to take an index.
    auto indexPred = [&](size_t idx) { return pred(first[idx]); };
    details::CopyIfScanBody scanBody(indexPred, first, d_first);
    const tbb::blocked_range<size_t> whole(0, std::distance(first, last));
    tbb::parallel_scan(whole, scanBody);
    // The final sum is the number of elements that were copied.
    return d_first + scanBody.get_sum();
  }
#endif
  return std::copy_if(first, last, d_first, pred);
}


// Copy values in the input range `[first, last)` to the output range

// starting from `d_first` that satisfies the predicate `pred`, i.e. `pred(x) ==

// true`, and returns `d_first + n` where `n` is the number of times the

// predicate is evaluated to true.

//

// This function is stable, meaning that the relative order of elements in the

// output range remains unchanged.

//

// The input range `[first, last)` and

// the output range `[d_first, d_first + last - first)`

// must not overlap.

template <typename InputIter, typename OutputIter, typename P>

OutputIter copy_if(InputIter first, InputIter last, OutputIter d_first,

                   P pred) {

  return copy_if(autoPolicy(first, last, 1e5), first, last, d_first, pred);

}


// Removes every element `x` of `[first, last)` for which `pred(x)` is true by
// moving the remaining elements to the front of the range.
//
// Returns `first + n`, where `n` is the number of elements for which the
// predicate is false.
//
// This function is stable, meaning that the relative order of the elements
// that remain is unchanged.
//
// `Iter` must be a random access iterator and the value type must be
// trivially destructible (both checked at compile time). With
// `ExecutionPolicy::Par` in a build where `MANIFOLD_PAR == 1` the kept
// elements are first gathered into a temporary array of `last - first`
// elements and then copied back; otherwise the call is forwarded to
// `std::remove_if`.
template <typename Iter, typename P,
          typename T = typename std::iterator_traits<Iter>::value_type>
Iter remove_if(ExecutionPolicy policy, Iter first, Iter last, P pred) {
  static_assert(std::is_convertible_v<
                    typename std::iterator_traits<Iter>::iterator_category,
                    std::random_access_iterator_tag>,
                "You can only parallelize RandomAccessIterator.");
  static_assert(std::is_trivially_destructible_v<T>,
                "Our simple implementation does not support types that are "
                "not trivially destructable.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    // Owns the scratch array so that it is released on every exit path,
    // including an exception escaping from `pred` or from the parallel
    // helpers below.
    struct Scratch {
      T *data;
      explicit Scratch(size_t n) : data(new T[n]) {}
      ~Scratch() { delete[] data; }
      Scratch(const Scratch &) = delete;
      Scratch &operator=(const Scratch &) = delete;
    };
    Scratch scratch(static_cast<size_t>(std::distance(first, last)));
    T *tmp = scratch.data;
    // Keep exactly the elements the caller does NOT want removed.
    auto back =
        copy_if(policy, first, last, tmp, [&](T v) { return !pred(v); });
    copy(policy, tmp, back, first);
    return first + std::distance(tmp, back);
  }
#endif
  return std::remove_if(first, last, pred);
}


// Remove values in the input range `[first, last)` that satisfies

// the predicate `pred`, i.e. `pred(x) == true`, and

// returns `first + n` where `n` is the number of times the predicate is

// evaluated to false.

//

// This function is stable, meaning that the relative order of elements that

// remained are unchanged.

//

// Only trivially destructable types are supported.

template <typename Iter, typename P,

          typename T = typename std::iterator_traits<Iter>::value_type>

Iter remove_if(Iter first, Iter last, P pred) {

  return remove_if(autoPolicy(first, last, 1e4), first, last, pred);

}


// Removes every element of `[first, last)` that is equal to `value` by moving
// the remaining elements to the front of the range.
//
// Returns `first + n`, where `n` is the number of elements that are not equal
// to `value`.
//
// This function is stable, meaning that the relative order of the elements
// that remain is unchanged.
//
// `Iter` must be a random access iterator and the value type must be
// trivially destructible (both checked at compile time). With
// `ExecutionPolicy::Par` in a build where `MANIFOLD_PAR == 1` the kept
// elements are first gathered into a temporary array of `last - first`
// elements and then copied back; otherwise the call is forwarded to
// `std::remove`.
template <typename Iter,
          typename T = typename std::iterator_traits<Iter>::value_type>
Iter remove(ExecutionPolicy policy, Iter first, Iter last, T value) {
  static_assert(std::is_convertible_v<
                    typename std::iterator_traits<Iter>::iterator_category,
                    std::random_access_iterator_tag>,
                "You can only parallelize RandomAccessIterator.");
  static_assert(std::is_trivially_destructible_v<T>,
                "Our simple implementation does not support types that are "
                "not trivially destructable.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par) {
    // Owns the scratch array so that it is released on every exit path,
    // including an exception escaping from the comparison or from the
    // parallel helpers below.
    struct Scratch {
      T *data;
      explicit Scratch(size_t n) : data(new T[n]) {}
      ~Scratch() { delete[] data; }
      Scratch(const Scratch &) = delete;
      Scratch &operator=(const Scratch &) = delete;
    };
    Scratch scratch(static_cast<size_t>(std::distance(first, last)));
    T *tmp = scratch.data;
    // Keep exactly the elements that differ from `value`.
    auto back =
        copy_if(policy, first, last, tmp, [&](T v) { return v != value; });
    copy(policy, tmp, back, first);
    return first + std::distance(tmp, back);
  }
#endif
  return std::remove(first, last, value);
}


// Remove values in the input range `[first, last)` that are equal to `value`.

// Returns `first + n` where `n` is the number of values

// that are not equal to `value`.

//

// This function is stable, meaning that the relative order of elements that

// remained are unchanged.

//

// Only trivially destructable types are supported.

template <typename Iter,

          typename T = typename std::iterator_traits<Iter>::value_type>

Iter remove(Iter first, Iter last, T value) {

  return remove(autoPolicy(first, last, 1e4), first, last, value);

}


// For each group of consecutive elements in the range `[first, last)` with the
// same value, unique removes all but the first element of the group. The return
// value is an iterator `new_last` such that no two consecutive elements in the
// range `[first, new_last)` are equal.
//
// This function is stable, meaning that the relative order of the elements
// that remain is unchanged.
//
// `Iter` must be a random access iterator and the value type must be
// trivially destructible (both checked at compile time). With
// `ExecutionPolicy::Par` in a build where `MANIFOLD_PAR == 1` a non-empty
// range is processed in chunks of at most `1 << 16` elements through a
// temporary buffer; otherwise the call is forwarded to `std::unique`.
template <typename Iter,
          typename T = typename std::iterator_traits<Iter>::value_type>
Iter unique(ExecutionPolicy policy, Iter first, Iter last) {
  static_assert(std::is_convertible_v<
                    typename std::iterator_traits<Iter>::iterator_category,
                    std::random_access_iterator_tag>,
                "You can only parallelize RandomAccessIterator.");
  static_assert(std::is_trivially_destructible_v<T>,
                "Our simple implementation does not support types that are "
                "not trivially destructable.");
#if (MANIFOLD_PAR == 1)
  if (policy == ExecutionPolicy::Par && first != last) {
    // Owns the scratch array so that it is released on every exit path,
    // including an exception escaping from the parallel helpers below.
    struct Scratch {
      T *data;
      explicit Scratch(size_t n) : data(new T[n]) {}
      ~Scratch() { delete[] data; }
      Scratch(const Scratch &) = delete;
      Scratch &operator=(const Scratch &) = delete;
    };
    // cap the maximum buffer size, proved to be beneficial for unique with huge
    // array size
    constexpr size_t MAX_BUFFER_SIZE = 1 << 16;
    Scratch scratch(std::min(
        MAX_BUFFER_SIZE, static_cast<size_t>(std::distance(first, last))));
    T *tmp = scratch.data;
    auto pred = [&](size_t i) { return tmp[i] != tmp[i + 1]; };
    Iter src = first;  // start of the next chunk still to be processed
    Iter dst = first;  // one past the last element kept so far
    do {
      const size_t length = std::min(
          MAX_BUFFER_SIZE, static_cast<size_t>(std::distance(src, last)));
      copy(policy, src, src + length, tmp);
      // The leading element of a chunk has no predecessor inside `tmp`. It is
      // always kept for the first chunk; for later chunks it is kept only if
      // it differs from the last element already kept, otherwise a run of
      // equal values that straddles a chunk boundary would be emitted twice.
      if (dst == first || *(dst - 1) != tmp[0]) {
        *dst = tmp[0];
        ++dst;
      }
      // this is not a typo, the index i is offset by 1, so to compare an
      // element with its predecessor we need to compare i and i + 1.
      details::CopyIfScanBody body(pred, tmp + 1, dst);
      tbb::parallel_scan(tbb::blocked_range<size_t>(0, length - 1), body);
      dst += body.get_sum();
      src += length;
    } while (src != last);
    return dst;
  }
#endif
  return std::unique(first, last);
}


// For each group of consecutive elements in the range `[first, last)` with the

// same value, unique removes all but the first element of the group. The return

// value is an iterator `new_last` such that no two consecutive elements in the

// range `[first, new_last)` are equal.

//

// This function is stable, meaning that the relative order of elements that

// remained are unchanged.

//

// Only trivially destructable types are supported.

template <typename Iter,

          typename T = typename std::iterator_traits<Iter>::value_type>

Iter unique(Iter first, Iter last) {

  return unique(autoPolicy(first, last, 1e4), first, last);

}


// Sorts the range `[first, last)` in ascending order.
//
// This function is stable, meaning that the relative order of elements that
// are incomparable remains unchanged.
//
// In a build where `MANIFOLD_PAR == 1` the work is delegated to
// `details::SortFunctor`, which receives `policy`; in every other build
// `policy` is ignored and `std::stable_sort` is used.
//
// Only trivially destructible types are supported.
template <typename Iterator,
          typename T = typename std::iterator_traits<Iterator>::value_type>
void stable_sort(ExecutionPolicy policy, Iterator first, Iterator last) {
#if (MANIFOLD_PAR != 1)
  std::stable_sort(first, last);
#else
  using Sorter = details::SortFunctor<Iterator, T>;
  Sorter()(policy, first, last);
#endif
}


// Sort the input range `[first, last)` in ascending order.

//

// This function is stable, meaning that the relative order of elements that are

// incomparable remains unchanged.

//

// Only trivially destructable types are supported.

template <typename Iterator,

          typename T = typename std::iterator_traits<Iterator>::value_type>

void stable_sort(Iterator first, Iterator last) {

  stable_sort(autoPolicy(first, last, 1e4), first, last);

}


// Sorts the range `[first, last)` in ascending order as defined by the
// comparison function `comp`.
//
// This function is stable, meaning that the relative order of elements that
// are incomparable remains unchanged.
//
// In a build where `MANIFOLD_PAR == 1` the work is delegated to
// `details::mergeSort`, which receives `policy`; in every other build
// `policy` is ignored and `std::stable_sort` is used.
//
// Only trivially destructible types are supported.
template <typename Iterator,
          typename T = typename std::iterator_traits<Iterator>::value_type,
          typename Comp = decltype(std::less<T>())>
void stable_sort(ExecutionPolicy policy, Iterator first, Iterator last,
                 Comp comp) {
#if (MANIFOLD_PAR != 1)
  std::stable_sort(first, last, comp);
#else
  details::mergeSort(policy, first, last, comp);
#endif
}


// Sort the input range `[first, last)` in ascending order using the comparison

// function `comp`.

//

// This function is stable, meaning that the relative order of elements that are

// incomparable remains unchanged.

//

// Only trivially destructable types are supported.

template <typename Iterator,

          typename T = typename std::iterator_traits<Iterator>::value_type,

          typename Comp = decltype(std::less<T>())>

void stable_sort(Iterator first, Iterator last, Comp comp) {

  stable_sort(autoPolicy(first, last, 1e4), first, last, comp);

}


// `scatter` copies elements from a source range into an output array according
// to a map: for every index `i` in `[0, last - first)`, `first[i]` is assigned
// to `outputFirst[mapFirst[i]]`. If the same index appears more than once in
// the range `[mapFirst, mapFirst + (last - first))`, the result is undefined.
//
// The indices are visited through `for_each` with the given `policy`.
//
// The map range, input range and the output range must not overlap.
template <typename InputIterator1, typename InputIterator2,
          typename OutputIterator>
void scatter(ExecutionPolicy policy, InputIterator1 first, InputIterator1 last,
             InputIterator2 mapFirst, OutputIterator outputFirst) {
  const size_t count = static_cast<size_t>(std::distance(first, last));
  // Places the i-th source element at the slot named by the i-th map entry.
  auto placeOne = [first, mapFirst, outputFirst](size_t i) {
    outputFirst[mapFirst[i]] = first[i];
  };
  for_each(policy, countAt(0), countAt(count), placeOne);
}


// `scatter` copies elements from a source range into an output array according

// to a map. For each iterator `i` in the range `[first, last)`, the value `*i`

// is assigned to `outputFirst[mapFirst[i - first]]`. If the same index appears

// more than once in the range `[mapFirst, mapFirst + (last - first))`,

// the result is undefined.

//

// The map range, input range and the output range must not overlap.

template <typename InputIterator1, typename InputIterator2,

          typename OutputIterator>

void scatter(InputIterator1 first, InputIterator1 last, InputIterator2 mapFirst,

             OutputIterator outputFirst) {

  scatter(autoPolicy(first, last, 1e5), first, last, mapFirst, outputFirst);

}


// `gather` copies elements from a source array into a destination range
// according to a map: for every index `i` in `[0, mapLast - mapFirst)`,
// `inputFirst[mapFirst[i]]` is assigned to `outputFirst[i]`.
//
// The indices are visited through `for_each` with the given `policy`.
//
// The map range, input range and the output range must not overlap.
template <typename InputIterator, typename RandomAccessIterator,
          typename OutputIterator>
void gather(ExecutionPolicy policy, InputIterator mapFirst,
            InputIterator mapLast, RandomAccessIterator inputFirst,
            OutputIterator outputFirst) {
  const size_t count = static_cast<size_t>(std::distance(mapFirst, mapLast));
  // Fetches the source element named by the i-th map entry into slot i.
  auto fetchOne = [mapFirst, inputFirst, outputFirst](size_t i) {
    outputFirst[i] = inputFirst[mapFirst[i]];
  };
  for_each(policy, countAt(0), countAt(count), fetchOne);
}


// `gather` copies elements from a source array into a destination range

// according to a map. For each input iterator `i`

// in the range `[mapFirst, mapLast)`, the value `inputFirst[*i]`

// is assigned to `outputFirst[i - map_first]`.

//

// The map range, input range and the output range must not overlap.

template <typename InputIterator, typename RandomAccessIterator,

          typename OutputIterator>

void gather(InputIterator mapFirst, InputIterator mapLast,

            RandomAccessIterator inputFirst, OutputIterator outputFirst) {

  gather(autoPolicy(std::distance(mapFirst, mapLast), 1e5), mapFirst, mapLast,

         inputFirst, outputFirst);

}


// Fills the range `[first, last)` with the consecutive values
// `0, 1, ..., last - first - 1`, i.e. `first[i] = i` for every valid index.
//
// The indices are visited through `for_each` with the given `policy`.
template <typename Iterator>
void sequence(ExecutionPolicy policy, Iterator first, Iterator last) {
  const size_t count = static_cast<size_t>(std::distance(first, last));
  // Stores each index into the slot of the same index.
  auto storeIndex = [first](size_t i) { first[i] = i; };
  for_each(policy, countAt(0), countAt(count), storeIndex);
}


// Write `[0, last - first)` to the range `[first, last)`.

template <typename Iterator>

void sequence(Iterator first, Iterator last) {

  sequence(autoPolicy(first, last, 1e5), first, last);

}


}  // namespace manifold

manifold
Definition common.h:22