Barretenberg: src/barretenberg/benchmark/pippenger_bench/thread_scaling.bench.cpp Source File

#include "barretenberg/common/thread.hpp"

#include "barretenberg/ecc/curves/bn254/bn254.hpp"

#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"

#include "barretenberg/numeric/random/engine.hpp"

#include "barretenberg/srs/global_crs.hpp"


#include <benchmark/benchmark.h>


#include "barretenberg/common/google_bb_bench.hpp"


// Pull the Google Benchmark names into this translation unit.
using namespace benchmark;


// Curve under test: BN254.
using Curve = bb::curve::BN254;

// Scalar field of the curve; the MSM scalars are of this type.
using Fr = Curve::ScalarField;

// Affine group element of the curve; the MSM base points are of this type.
using G1 = Curve::AffineElement;


namespace {


// Number of (point, scalar) pairs in every benchmarked MSM: 2^20.
constexpr size_t MSM_SIZE = size_t{ 1 } << 20;

// Shape of the scalar vector handed to the MSM.
enum class Distribution {
    Clustered,    // first half 32-bit scalars, second half full-width scalars
    UniformMixed, // each entry independently 32-bit or full-width
    AllFull       // every entry is a full-width random field element
};


/**
 * @brief Benchmark fixture that owns the CRS and generates the scalar inputs for the MSM runs.
 *
 * The CRS is fetched once per fixture instance (sized for MSM_SIZE points); scalars are drawn
 * from the debug randomness engine.
 */
class ThreadScalingBench : public benchmark::Fixture {
  public:
    std::shared_ptr<bb::srs::factories::Crs<Curve>> srs;
    bb::numeric::RNG& engine = bb::numeric::get_debug_randomness();

    /**
     * @brief Initialise the file-backed CRS factory and fetch a CRS of MSM_SIZE points.
     *
     * Does nothing when `srs` has already been populated by an earlier call.
     */
    void SetUp([[maybe_unused]] const ::benchmark::State& state) override
    {
        if (!srs) {
            bb::srs::init_file_crs_factory(bb::srs::bb_crs_path());
            srs = bb::srs::get_crs_factory<Curve>()->get_crs(MSM_SIZE);
        }
    }

    /**
     * @brief Draw a scalar whose value fits in 32 bits (stand-in for witness indices, booleans, limbs).
     *
     * With a 254-bit BN254 field and roughly 14 bits per Pippenger slice, such scalars only have
     * nonzero slices in the lowest ~2-3 rounds; the remaining rounds get filtered.
     */
    Fr small_scalar()
    {
        const auto low_bits = engine.get_random_uint32();
        return Fr(static_cast<uint64_t>(low_bits));
    }

    /** @brief Draw a random field element from the fixture's engine. */
    Fr full_scalar()
    {
        return Fr::random_element(&engine);
    }

    /**
     * @brief Produce MSM_SIZE scalars laid out according to @p dist.
     *
     * @param dist Clustered: first half small, second half full. UniformMixed: one random bit per
     *             entry selects small (bit set) or full (bit clear). AllFull: every entry full.
     * @return The generated scalar vector.
     */
    std::vector<Fr> build_scalars(Distribution dist)
    {
        std::vector<Fr> out(MSM_SIZE);

        if (dist == Distribution::Clustered) {
            const size_t split = MSM_SIZE / 2;
            // Small scalars occupy [0, split), full scalars occupy [split, MSM_SIZE).
            for (size_t idx = 0; idx != split; ++idx) {
                out[idx] = small_scalar();
            }
            for (size_t idx = split; idx != MSM_SIZE; ++idx) {
                out[idx] = full_scalar();
            }
        } else if (dist == Distribution::UniformMixed) {
            for (Fr& slot : out) {
                // The selector bit is drawn before the scalar itself.
                const bool pick_small = (engine.get_random_uint32() & 1U) != 0;
                slot = pick_small ? small_scalar() : full_scalar();
            }
        } else if (dist == Distribution::AllFull) {
            for (Fr& slot : out) {
                slot = full_scalar();
            }
        }

        return out;
    }
};


/**
 * @brief Time one MSM of MSM_SIZE points at the thread count given by the benchmark argument.
 *
 * @param fx    Fixture providing the CRS points and the scalar generator.
 * @param state Benchmark state; `state.range(0)` is the parallel_for concurrency to use.
 * @param dist  Scalar distribution to generate for this run.
 *
 * The concurrency value reported by bb::get_num_cpus() before the run is restored afterwards.
 */
static void run_msm(ThreadScalingBench& fx, benchmark::State& state, Distribution dist)
{
    const size_t num_threads = static_cast<size_t>(state.range(0));

    // Scalars are generated once per benchmark invocation, outside the timed loop. Per the
    // original note, batch_multi_scalar_mul mutates them (Montgomery round-trip), and the same
    // vector is reused so every iteration sees consistent input.
    std::vector<Fr> scalars = fx.build_scalars(dist);

    // The batch API takes spans of spans; this bench submits a single MSM.
    std::vector<std::span<Fr>> scalar_spans;
    std::vector<std::span<const G1>> point_spans;
    scalar_spans.emplace_back(scalars);
    point_spans.emplace_back(fx.srs->get_monomial_points().subspan(0, MSM_SIZE));

    const size_t original_concurrency = bb::get_num_cpus();
    bb::set_parallel_for_concurrency(num_threads);

    for (auto _ : state) {
        GOOGLE_BB_BENCH_REPORTER(state);
        // Third argument is handle_edge_cases = false.
        auto msm_results =
            bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_spans, scalar_spans, false);
        // Keep the result observable so the measured work cannot be treated as dead.
        benchmark::DoNotOptimize(msm_results);
    }

    bb::set_parallel_for_concurrency(original_concurrency);
}


// MSM benchmark with half small / half full scalars, grouped by kind.
BENCHMARK_DEFINE_F(ThreadScalingBench, Clustered)(benchmark::State& state)
{
    ThreadScalingBench& fixture = *this;
    run_msm(fixture, state, Distribution::Clustered);
}

// MSM benchmark with small and full scalars interleaved at random.
BENCHMARK_DEFINE_F(ThreadScalingBench, UniformMixed)(benchmark::State& state)
{
    ThreadScalingBench& fixture = *this;
    run_msm(fixture, state, Distribution::UniformMixed);
}

// MSM benchmark where every scalar is a full random field element.
BENCHMARK_DEFINE_F(ThreadScalingBench, AllFull)(benchmark::State& state)
{
    ThreadScalingBench& fixture = *this;
    run_msm(fixture, state, Distribution::AllFull);
}


// Registers the thread counts to sweep (1, 2, 4, 8) as the benchmark's single argument.
static void ThreadSweep(benchmark::internal::Benchmark* b)
{
    b->Arg(1)->Arg(2)->Arg(4)->Arg(8);
}


// Register each distribution over the thread sweep; timings are reported in milliseconds.
BENCHMARK_REGISTER_F(ThreadScalingBench, Clustered)->Apply(ThreadSweep)->Unit(benchmark::kMillisecond);
BENCHMARK_REGISTER_F(ThreadScalingBench, UniformMixed)->Apply(ThreadSweep)->Unit(benchmark::kMillisecond);
BENCHMARK_REGISTER_F(ThreadScalingBench, AllFull)->Apply(ThreadSweep)->Unit(benchmark::kMillisecond);


} // namespace


// Google Benchmark macro that supplies the program's main() and runs the registered benchmarks.
BENCHMARK_MAIN();

bb::curve::BN254
Definition bn254.hpp:16

bb::curve::BN254::AffineElement
typename Group::affine_element AffineElement
Definition bn254.hpp:22

bb::curve::BN254::ScalarField
bb::fr ScalarField
Definition bn254.hpp:18

bb::numeric::RNG
Definition engine.hpp:17

bb::scalar_multiplication::MSM::batch_multi_scalar_mul
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< std::span< const AffineElement > > points, std::span< std::span< ScalarField > > scalars, bool handle_edge_cases=true) noexcept
Compute multiple MSMs in parallel with work balancing.
Definition scalar_multiplication.cpp:497

b
FF b
Definition field_gt.test.cpp:53

bn254.hpp

engine.hpp

global_crs.hpp

google_bb_bench.hpp

GOOGLE_BB_BENCH_REPORTER
#define GOOGLE_BB_BENCH_REPORTER(state)
Definition google_bb_bench.hpp:53

bb::numeric::get_debug_randomness
RNG & get_debug_randomness(bool reset, std::uint_fast64_t seed)
Definition engine.cpp:245

bb::srs::bb_crs_path
std::filesystem::path bb_crs_path()
Definition global_crs.cpp:14

bb::srs::init_file_crs_factory
void init_file_crs_factory(const std::filesystem::path &path)
Definition global_crs.hpp:14

bb::get_num_cpus
size_t get_num_cpus()
Definition thread.cpp:33

bb::set_parallel_for_concurrency
void set_parallel_for_concurrency(size_t num_cores)
Definition thread.cpp:23

std::get
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13

G1
Curve::AffineElement G1
Definition pippenger.bench.cpp:22

scalar_multiplication.hpp

bb::field< Bn254FrParams >

bb::field< Bn254FrParams >::random_element
static field random_element(numeric::RNG *engine=nullptr) noexcept
Definition field_impl.hpp:777

thread.hpp

BENCHMARK_MAIN
BENCHMARK_MAIN()

Fr
Curve::ScalarField Fr
Definition thread_scaling.bench.cpp:33