// gpu/gpu_8h_source.html

// Copyright (c) 2024-2026 Lux Industries Inc.

// SPDX-License-Identifier: BSD-3-Clause-Eco

//

// Lux GPU - Unified GPU acceleration with switchable backends

//

// Backends:

//   - Metal: Apple Silicon (macOS/iOS)

//   - CUDA: NVIDIA GPUs

//   - Dawn: WebGPU via Dawn (cross-platform)

//   - CPU: SIMD-optimized fallback

//

// Usage:

//   #include <lux/gpu.h>

//

//   LuxGPU* gpu = lux_gpu_create();

//   lux_gpu_set_backend(gpu, LUX_BACKEND_METAL);

//

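//   int64_t shape[2] = {2, 3};
//   LuxTensor* a = lux_tensor_zeros(gpu, shape, 2, LUX_FLOAT32);
//   LuxTensor* b = lux_tensor_ones(gpu, shape, 2, LUX_FLOAT32);
//   LuxTensor* c = lux_tensor_add(gpu, a, b);
//
//   lux_gpu_sync(gpu);
//
//   lux_tensor_destroy(c);
//   lux_tensor_destroy(b);
//   lux_tensor_destroy(a);
//   lux_gpu_destroy(gpu);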


#ifndef LUX_GPU_H

#define LUX_GPU_H


#include <stddef.h>

#include <stdint.h>

#include <stdbool.h>


#ifdef __cplusplus

extern "C" {

#endif


// =============================================================================

// Version

// =============================================================================


// Library version, MAJOR.MINOR.PATCH. Each component is a plain integer
// literal, so the macros can be used in #if expressions.
#define LUX_GPU_VERSION_MAJOR 0
#define LUX_GPU_VERSION_MINOR 2
#define LUX_GPU_VERSION_PATCH 0


// =============================================================================

// Backend Types

// =============================================================================


// Selects the compute backend a context runs on. Every value is pinned
// explicitly, so reordering the enumerators cannot change them.
typedef enum {
    // Auto-detect best backend
    LUX_BACKEND_AUTO  = 0,
    // CPU with SIMD
    LUX_BACKEND_CPU   = 1,
    // Apple Metal
    LUX_BACKEND_METAL = 2,
    // NVIDIA CUDA
    LUX_BACKEND_CUDA  = 3,
    // WebGPU via Dawn
    LUX_BACKEND_DAWN  = 4
} LuxBackend;


// Element type tag for tensors (the `dtype` argument of the tensor
// constructors). Values are pinned explicitly.
typedef enum {
    // Floating-point types
    LUX_FLOAT32  = 0,
    LUX_FLOAT16  = 1,
    LUX_BFLOAT16 = 2,
    // Signed integer types
    LUX_INT32    = 3,
    LUX_INT64    = 4,
    // Unsigned integer types
    LUX_UINT32   = 5,
    LUX_UINT64   = 6,
    // Boolean
    LUX_BOOL     = 7
} LuxDtype;


// Status code returned by most API calls. LUX_OK (0) is the only success
// value; every other enumerator names a failure category.
typedef enum {
    LUX_OK                          = 0,
    LUX_ERROR_INVALID_ARGUMENT      = 1,
    LUX_ERROR_OUT_OF_MEMORY         = 2,
    LUX_ERROR_BACKEND_NOT_AVAILABLE = 3,
    LUX_ERROR_DEVICE_NOT_FOUND      = 4,
    LUX_ERROR_KERNEL_FAILED         = 5,
    LUX_ERROR_NOT_SUPPORTED         = 6
} LuxError;


// =============================================================================

// Curve Types (for crypto operations)

// =============================================================================


// Elliptic curve selector for the crypto operations that take a `curve`
// argument (lux_msm and the lux_kzg_* functions). Values are pinned
// explicitly.
typedef enum {
    LUX_CURVE_BLS12_381 = 0,
    LUX_CURVE_BN254     = 1,
    LUX_CURVE_SECP256K1 = 2,
    LUX_CURVE_ED25519   = 3
} LuxCurve;


// =============================================================================

// Opaque Types

// =============================================================================


// Opaque handle types. They are only forward-declared here, so callers hold
// and pass pointers to them without seeing their layout.
typedef struct LuxGPU    LuxGPU;     // GPU context
typedef struct LuxTensor LuxTensor;  // Tensor
typedef struct LuxStream LuxStream;  // Stream
typedef struct LuxEvent  LuxEvent;   // Event


// =============================================================================

// Device Info

// =============================================================================


// Description of a single compute device. Filled in through the
// `LuxDeviceInfo* info` out-parameter of lux_gpu_device_info() and
// lux_device_info().
typedef struct {
    // Backend the device belongs to.
    LuxBackend backend;
    // Device index.
    int index;
    // Device and vendor name strings.
    // NOTE(review): string ownership/lifetime is not stated in this header —
    // presumably owned by the library; confirm before freeing or caching.
    const char* name;
    const char* vendor;
    // Total and currently available memory (presumably in bytes — TODO confirm).
    uint64_t memory_total;
    uint64_t memory_available;
    // Device classification flags.
    bool is_discrete;
    bool is_unified_memory;
    // Number of compute units and maximum workgroup size.
    int compute_units;
    int max_workgroup_size;
} LuxDeviceInfo;


// =============================================================================

// GPU Context

// =============================================================================


// Context lifecycle.
//
//   lux_gpu_create               - create a context, auto-detecting the best
//                                  backend
//   lux_gpu_create_with_backend  - create a context on a specific backend
//   lux_gpu_create_with_device   - create a context on a specific backend
//                                  and device index
//   lux_gpu_destroy              - destroy a context
//
// NOTE(review): the result on failure is not documented here — presumably
// NULL; confirm against the implementation.
LuxGPU* lux_gpu_create(void);
LuxGPU* lux_gpu_create_with_backend(LuxBackend backend);
LuxGPU* lux_gpu_create_with_device(LuxBackend backend, int device_index);
void lux_gpu_destroy(LuxGPU* gpu);


// Backend currently in use by `gpu`.
LuxBackend lux_gpu_backend(LuxGPU* gpu);

// Name string of the backend currently in use by `gpu`.
// NOTE(review): string lifetime is not stated — presumably a static string
// the caller must not free; confirm.
const char* lux_gpu_backend_name(LuxGPU* gpu);

// Switch backend at runtime.
//
// Returns:
//   LUX_ERROR_INVALID_ARGUMENT       if any LuxTensor created against the
//                                    current backend is still alive — destroy
//                                    outstanding tensors before swapping
//                                    backends.
//   LUX_ERROR_BACKEND_NOT_AVAILABLE  if the target backend isn't loadable.
LuxError lux_gpu_set_backend(LuxGPU* gpu, LuxBackend backend);


// Get device info for the context; the result is written to `*info`.
LuxError lux_gpu_device_info(LuxGPU* gpu, LuxDeviceInfo* info);

// Synchronize all operations.
LuxError lux_gpu_sync(LuxGPU* gpu);

// Get the last error message.
// NOTE(review): lifetime of the returned string and the result when no error
// has occurred are not stated here — confirm before caching the pointer.
const char* lux_gpu_error(LuxGPU* gpu);


// =============================================================================

// Backend Query

// =============================================================================


// Backend and device enumeration. None of these take a LuxGPU context.

// Number of available backends.
int lux_backend_count(void);

// Whether `backend` is available.
bool lux_backend_available(LuxBackend backend);

// Name of `backend`.
const char* lux_backend_name(LuxBackend backend);

// Number of devices for `backend`.
int lux_device_count(LuxBackend backend);

// Device info for the (backend, index) pair; the result is written to `*info`.
LuxError lux_device_info(LuxBackend backend, int index, LuxDeviceInfo* info);


// =============================================================================

// Tensor Operations

// =============================================================================


// Tensor construction and destruction.
//
// For every constructor, `shape` is read as an array of `ndim` extents and
// `dtype` selects the element type.
// NOTE(review): the result on failure is not documented — presumably NULL;
// confirm. Returned tensors are presumably released with
// lux_tensor_destroy().

// Tensor filled with zeros.
LuxTensor* lux_tensor_zeros(LuxGPU* gpu,
                            const int64_t* shape, int ndim,
                            LuxDtype dtype);

// Tensor filled with ones.
LuxTensor* lux_tensor_ones(LuxGPU* gpu,
                           const int64_t* shape, int ndim,
                           LuxDtype dtype);

// Tensor filled with `value`.
LuxTensor* lux_tensor_full(LuxGPU* gpu,
                           const int64_t* shape, int ndim,
                           LuxDtype dtype, double value);

// Tensor created from caller-supplied `data`.
// NOTE(review): presumably `data` holds prod(shape) elements of `dtype` and
// is copied — confirm layout and whether the buffer may be freed afterwards.
LuxTensor* lux_tensor_from_data(LuxGPU* gpu,
                                const void* data,
                                const int64_t* shape, int ndim,
                                LuxDtype dtype);

// Destroy a tensor.
void lux_tensor_destroy(LuxTensor* tensor);


// Tensor shape and type inspectors.
//   lux_tensor_ndim   - number of dimensions
//   lux_tensor_shape  - shape entry for dimension `dim`
//   lux_tensor_size   - tensor size (NOTE(review): element count vs. bytes is
//                       not stated here — confirm)
//   lux_tensor_dtype  - element type
int lux_tensor_ndim(LuxTensor* tensor);
int64_t lux_tensor_shape(LuxTensor* tensor, int dim);
int64_t lux_tensor_size(LuxTensor* tensor);
LuxDtype lux_tensor_dtype(LuxTensor* tensor);

// Copy tensor data to the host buffer `data`.
// NOTE(review): `size` is presumably the capacity of `data` in bytes —
// confirm.
LuxError lux_tensor_to_host(LuxTensor* tensor, void* data, size_t size);


// Binary arithmetic on two tensors; each call returns a result tensor.
// NOTE(review): shape/dtype compatibility rules (e.g. broadcasting) and the
// failure result are not stated in this header — confirm.
LuxTensor* lux_tensor_add(LuxGPU* gpu, LuxTensor* a, LuxTensor* b);     // add
LuxTensor* lux_tensor_sub(LuxGPU* gpu, LuxTensor* a, LuxTensor* b);     // subtract
LuxTensor* lux_tensor_mul(LuxGPU* gpu, LuxTensor* a, LuxTensor* b);     // multiply
LuxTensor* lux_tensor_div(LuxGPU* gpu, LuxTensor* a, LuxTensor* b);     // divide
LuxTensor* lux_tensor_matmul(LuxGPU* gpu, LuxTensor* a, LuxTensor* b);  // matrix multiply


// Unary operations: each takes one tensor `t` and returns a result tensor.
LuxTensor* lux_tensor_neg(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_exp(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_log(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_sqrt(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_abs(LuxGPU* gpu, LuxTensor* t);

// Activation functions.
LuxTensor* lux_tensor_tanh(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_sigmoid(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_relu(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_gelu(LuxGPU* gpu, LuxTensor* t);


// Full reductions: collapse the whole tensor to one scalar.
// NOTE(review): the float return leaves no channel for a LuxError — how
// failures are reported (NaN? lux_gpu_error?) is not stated; confirm.
float lux_tensor_reduce_sum(LuxGPU* gpu, LuxTensor* t);
float lux_tensor_reduce_max(LuxGPU* gpu, LuxTensor* t);
float lux_tensor_reduce_min(LuxGPU* gpu, LuxTensor* t);
float lux_tensor_reduce_mean(LuxGPU* gpu, LuxTensor* t);

// Reductions along axes: `axes` is read as an array of `naxes` axis indices.
LuxTensor* lux_tensor_sum(LuxGPU* gpu, LuxTensor* t,
                          const int* axes, int naxes);
LuxTensor* lux_tensor_mean(LuxGPU* gpu, LuxTensor* t,
                           const int* axes, int naxes);
LuxTensor* lux_tensor_max(LuxGPU* gpu, LuxTensor* t,
                          const int* axes, int naxes);
LuxTensor* lux_tensor_min(LuxGPU* gpu, LuxTensor* t,
                          const int* axes, int naxes);


// Softmax / log-softmax along `axis`.
LuxTensor* lux_tensor_softmax(LuxGPU* gpu, LuxTensor* t, int axis);
LuxTensor* lux_tensor_log_softmax(LuxGPU* gpu, LuxTensor* t, int axis);

// Normalization. `eps` is the epsilon term; `gamma`/`beta` (layer norm) and
// `weight` (RMS norm) are tensor parameters.
// NOTE(review): which axis is normalized is not stated here — confirm.
LuxTensor* lux_tensor_layer_norm(LuxGPU* gpu, LuxTensor* t,
                                 LuxTensor* gamma, LuxTensor* beta,
                                 float eps);
LuxTensor* lux_tensor_rms_norm(LuxGPU* gpu, LuxTensor* t,
                               LuxTensor* weight, float eps);

// Transpose and copy.
// NOTE(review): lux_tensor_transpose takes no axis arguments — presumably a
// 2-D transpose; confirm behaviour for other ranks.
LuxTensor* lux_tensor_transpose(LuxGPU* gpu, LuxTensor* t);
LuxTensor* lux_tensor_copy(LuxGPU* gpu, LuxTensor* t);


// =============================================================================

// Crypto Operations: Hash Functions

// =============================================================================


// Poseidon2 hash (algebraic hash for ZK circuits), batched.
//
//   inputs      [num_hashes * rate] input words
//   outputs     [num_hashes] output words
//   rate        Poseidon rate parameter
//   num_hashes  number of hashes to compute
LuxError lux_poseidon2_hash(LuxGPU* gpu,
                            const uint64_t* inputs, uint64_t* outputs,
                            size_t rate, size_t num_hashes);


// BLAKE3 hash (high-performance cryptographic hash), batched.
//
//   inputs      concatenated inputs
//   outputs     [num_hashes * 32] output bytes
//   input_lens  length of each input
//   num_hashes  number of inputs / digests
LuxError lux_blake3_hash(LuxGPU* gpu,
                         const uint8_t* inputs, uint8_t* outputs,
                         const size_t* input_lens, size_t num_hashes);


// Keccak-256 hash, batched. This is the Ethereum variant, NOT NIST SHA-3:
//   - Padding: 0x01 || 0x00...0x00 || 0x80 (Keccak, not SHA-3's 0x06)
//   - Output: 32 bytes per input
//   - Primary use: EVM state trie hashing, address derivation
//
//   inputs      concatenated inputs
//   outputs     [num_inputs * 32] output bytes
//   input_lens  length of each input
//   num_inputs  number of inputs / digests
LuxError lux_gpu_keccak256_batch(LuxGPU* gpu,
                                 const uint8_t* inputs, uint8_t* outputs,
                                 const size_t* input_lens, size_t num_inputs);


// =============================================================================

// Crypto Operations: secp256k1 ECDSA Recovery (Ethereum ecrecover)

// =============================================================================


// One packed entry of an ecrecover batch. Layout, 128 bytes in total:
//
//   offset   0  r[32]         ECDSA r value (big-endian)
//   offset  32  s[32]         ECDSA s value (big-endian)
//   offset  64  v             Recovery id (0 or 1)
//   offset  65  _pad[3]       Alignment padding
//   offset  68  msg_hash[32]  Message hash (big-endian)
//   offset 100  _pad2[28]     Pad to 128 bytes
//
// Every member is a uint8_t (array), so the struct needs no implicit padding.
typedef struct {
    uint8_t r[32];
    uint8_t s[32];
    uint8_t v;
    uint8_t _pad[3];
    uint8_t msg_hash[32];
    uint8_t _pad2[28];
} LuxEcrecoverInput;


// One result entry of an ecrecover batch: the recovered Ethereum address.
// Layout, 32 bytes in total:
//
//   offset  0  address[20]  Recovered address (or zeros on failure)
//   offset 20  valid        1 if recovery succeeded, 0 otherwise
//   offset 21  _pad[11]     Pad to 32 bytes
typedef struct {
    uint8_t address[20];
    uint8_t valid;
    uint8_t _pad[11];
} LuxEcrecoverOutput;


// Batch secp256k1 ECDSA public key recovery -> Ethereum address.
//
// For each signature (r, s, v, msg_hash):
//   1. Recover public key Q from the ECDSA signature
//   2. Compute address = keccak256(Q.x || Q.y)[12:]
//
// This is the EVM ecrecover precompile, batched for GPU parallelism. Each
// GPU thread processes one signature independently.
//
// Signature malleability (low-s vs high-s)
// ----------------------------------------
// BOTH low-s (s <= n/2) and high-s (s > n/2) signatures are accepted. That
// matches the Ethereum 0x01 ecrecover precompile, which recovers an address
// from any in-range (r, s, v) triple regardless of which side of n/2 s falls
// on. It does NOT match EIP-2's strict low-s rule, which ethereum txpool /
// EIP-155 transactions enforce at the consensus layer above the precompile.
//
// Address recovery is unchanged by s-malleability: (r, s, v) and
// (r, n-s, v') produce the SAME recovered public key (and therefore the same
// address) up to a flip of the recovery-id parity. Callers that need to
// reject malleable signatures (EIP-2 enforcement, txpool admission, replay
// protection on non-precompile signature surfaces) MUST check `s <= n/2`
// separately — this function does not.
//
// Returns LUX_OK on success. Individual recovery failures are reported via
// valid=0 in the matching output entry; the batch call itself only fails on
// argument errors.
LuxError lux_gpu_ecrecover_batch(LuxGPU* gpu,
                                 const LuxEcrecoverInput* signatures,
                                 LuxEcrecoverOutput* addresses,
                                 size_t num_signatures);


// =============================================================================

// Crypto Operations: MSM (Multi-Scalar Multiplication)

// =============================================================================


// Multi-scalar multiplication over `count` scalar-point pairs.
//
//   scalars  scalar field elements
//   points   curve points (affine)
//   result   single output point
//   count    number of scalar-point pairs
//   curve    which curve to use
//
// NOTE(review): element encodings behind the void* buffers (limb count,
// endianness, Montgomery form) are not stated in this header — confirm per
// curve before building inputs.
LuxError lux_msm(LuxGPU* gpu,
                 const void* scalars, const void* points,
                 void* result, size_t count, LuxCurve curve);


// =============================================================================

// Crypto Operations: BLS12-381 Curve

// =============================================================================


// BLS12-381 group operations. For add/mul, `is_g2` selects between G1 and G2
// and `count` is the batch size.
// NOTE(review): point/scalar encodings behind the void* buffers are not
// stated in this header — confirm.

// Point addition (G1 or G2).
LuxError lux_bls12_381_add(LuxGPU* gpu,
                           const void* a,
                           const void* b,
                           void* out,
                           size_t count,
                           bool is_g2);

// Scalar multiplication (G1 or G2).
LuxError lux_bls12_381_mul(LuxGPU* gpu,
                           const void* points,
                           const void* scalars,
                           void* out,
                           size_t count,
                           bool is_g2);

// Pairing computation (multi-pairing for efficiency) over `count`
// (g1_points, g2_points) pairs.
LuxError lux_bls12_381_pairing(LuxGPU* gpu,
                               const void* g1_points,
                               const void* g2_points,
                               void* out,
                               size_t count);


// High-level BLS signature verification of one (sig, msg, pubkey) triple.
// The verification outcome is written to `*result`; the LuxError return is
// the call status.
LuxError lux_bls_verify(LuxGPU* gpu,
                        const uint8_t* sig, size_t sig_len,
                        const uint8_t* msg, size_t msg_len,
                        const uint8_t* pubkey, size_t pubkey_len,
                        bool* result);

// Batched form of lux_bls_verify: `sigs`, `msgs` and `pubkeys` are arrays of
// `count` pointers with matching `*_lens` arrays; `results` receives one
// bool per entry.
// NOTE(review): `count` is a signed int here while the rest of the header
// uses size_t — behaviour for negative values is unspecified; confirm.
LuxError lux_bls_verify_batch(LuxGPU* gpu,
                              const uint8_t* const* sigs,
                              const size_t* sig_lens,
                              const uint8_t* const* msgs,
                              const size_t* msg_lens,
                              const uint8_t* const* pubkeys,
                              const size_t* pubkey_lens,
                              int count, bool* results);

// Aggregate `count` signatures into `out`.
// NOTE(review): `out_len` is presumably in/out (capacity in, bytes written
// out) — confirm. `count` is a signed int, as above.
LuxError lux_bls_aggregate(LuxGPU* gpu,
                           const uint8_t* const* sigs,
                           const size_t* sig_lens,
                           int count, uint8_t* out, size_t* out_len);


// =============================================================================

// Crypto Operations: BN254 Curve

// =============================================================================


// BN254 group operations. `is_g2` selects between G1 and G2 and `count` is
// the batch size.
// NOTE(review): point/scalar encodings behind the void* buffers are not
// stated in this header — confirm.

// Point addition (G1 or G2).
LuxError lux_bn254_add(LuxGPU* gpu,
                       const void* a,
                       const void* b,
                       void* out,
                       size_t count,
                       bool is_g2);

// Scalar multiplication (G1 or G2).
LuxError lux_bn254_mul(LuxGPU* gpu,
                       const void* points,
                       const void* scalars,
                       void* out,
                       size_t count,
                       bool is_g2);


// =============================================================================

// Crypto Operations: KZG Polynomial Commitments

// =============================================================================


// KZG polynomial commitments.
// NOTE(review): `degree` is documented as the polynomial degree; whether
// `coeffs`/`srs` hold `degree` or `degree + 1` elements is not stated —
// confirm before sizing buffers.

// Commit to a polynomial using the SRS.
//   coeffs      polynomial coefficients
//   srs         SRS G1 points
//   commitment  output commitment
//   degree      polynomial degree
LuxError lux_kzg_commit(LuxGPU* gpu,
                        const void* coeffs, const void* srs,
                        void* commitment, size_t degree, LuxCurve curve);

// Open a commitment at an evaluation point.
//   coeffs  polynomial coefficients
//   srs     SRS G1 points
//   point   evaluation point
//   proof   output proof
//   degree  polynomial degree
LuxError lux_kzg_open(LuxGPU* gpu,
                      const void* coeffs, const void* srs, const void* point,
                      void* proof, size_t degree, LuxCurve curve);

// Verify a KZG opening proof. The verdict is written to `*result`; the
// LuxError return is the call status.
//   commitment  commitment point
//   proof       proof point
//   point       evaluation point
//   value       claimed evaluation
//   srs_g2      G2 element from the SRS
//   result      verification result
LuxError lux_kzg_verify(LuxGPU* gpu,
                        const void* commitment, const void* proof,
                        const void* point, const void* value,
                        const void* srs_g2, bool* result, LuxCurve curve);


// =============================================================================

// FHE Operations: NTT (Number Theoretic Transform)

// =============================================================================


// Number Theoretic Transform over `n` coefficients modulo `modulus`.
// `data` is non-const, so the transform presumably runs in place — confirm.
// NOTE(review): constraints on `n` (power of two?) and `modulus`
// (NTT-friendly prime?) are not stated in this header — confirm.
LuxError lux_ntt_forward(LuxGPU* gpu,
                         uint64_t* data, size_t n, uint64_t modulus);
LuxError lux_ntt_inverse(LuxGPU* gpu,
                         uint64_t* data, size_t n, uint64_t modulus);

// Batched NTT over `count` polynomials of `n` coefficients each; `polys` is
// an array of `count` pointers.
// NOTE(review): the transform direction of the batch call is not stated —
// presumably forward; confirm.
LuxError lux_ntt_batch(LuxGPU* gpu,
                       uint64_t** polys, size_t count, size_t n,
                       uint64_t modulus);


// =============================================================================

// FHE Operations: Polynomial Arithmetic

// =============================================================================


// Negacyclic polynomial multiplication:
//   result = a * b mod (X^n + 1) mod modulus
// `a`, `b` and `result` are coefficient arrays of `n` words each.
LuxError lux_poly_mul(LuxGPU* gpu,
                      const uint64_t* a,
                      const uint64_t* b,
                      uint64_t* result,
                      size_t n,
                      uint64_t modulus);


// =============================================================================

// FHE Operations: TFHE

// =============================================================================


// TFHE programmable bootstrap: evaluates the LUT encoded in `test_poly` on
// the encrypted input.
//
//   lwe_in     input LWE, [n_lwe + 1] words
//   lwe_out    output LWE, [k*N + 1] words
//   bsk        bootstrapping key, shape [n_lwe][(k+1)*l][k+1][N] u64
//   test_poly  test polynomial (LUT)
//   n_lwe      input LWE dimension
//   N          GLWE polynomial degree (power of two)
//   k          GLWE dimension (>= 1); k = 0 is rejected as INVALID_ARGUMENT
//   l          decomposition levels
//   base_log   bits per gadget digit (B = 2^base_log)
//   q          modulus
//
// Gadget contract: B = 2^base_log requires l * base_log <= log2(q);
// otherwise the bottom gadget level collapses to zero.
// NOTE(review): lux_fhe_is_valid_gadget documents the bound as
// l*base_log <= 64 rather than <= log2(q) — confirm which one this function
// enforces.
LuxError lux_tfhe_bootstrap(LuxGPU* gpu,
                            const uint64_t* lwe_in, uint64_t* lwe_out,
                            const uint64_t* bsk, const uint64_t* test_poly,
                            uint32_t n_lwe, uint32_t N, uint32_t k,
                            uint32_t l, uint32_t base_log, uint64_t q);


// TFHE key switching: changes the LWE key. KSK rows encode an LWE encryption
// (under the OUT key) of -s_{in_idx} * q / B^{level+1}, with B = 2^base_log.
//
//   lwe_in    input LWE, [n_in + 1] words
//   lwe_out   output LWE, [n_out + 1] words
//   ksk       key switching key
//   n_in      input dimension
//   n_out     output dimension
//   l         decomposition levels
//   base_log  base log
//   q         modulus
LuxError lux_tfhe_keyswitch(LuxGPU* gpu,
                            const uint64_t* lwe_in, uint64_t* lwe_out,
                            const uint64_t* ksk,
                            uint32_t n_in, uint32_t n_out,
                            uint32_t l, uint32_t base_log, uint64_t q);


// Blind rotation: rotates the polynomial accumulator by an encrypted amount.
// Uses the same BSK shape and gadget contract as lux_tfhe_bootstrap.
//
//   acc       accumulator GLWE, [(k+1) * N] words
//   bsk       bootstrapping key
//   lwe_a     LWE 'a' coefficients, [n_lwe] words
//   n_lwe     LWE dimension
//   N         GLWE polynomial degree (power of two)
//   k         GLWE dimension (>= 1)
//   l         decomposition levels
//   base_log  bits per gadget digit (B = 2^base_log)
//   q         modulus
LuxError lux_blind_rotate(LuxGPU* gpu,
                          uint64_t* acc,
                          const uint64_t* bsk, const uint64_t* lwe_a,
                          uint32_t n_lwe, uint32_t N, uint32_t k,
                          uint32_t l, uint32_t base_log, uint64_t q);


// =============================================================================

// FHE Helpers — small inspectors and parameter validators exposed so that

// callers can pre-check inputs before dispatching to the heavy ops above.

// Each helper is stateless and constant-time over its inputs.

// =============================================================================


// True iff N is a power of two in (0, 2^20]. Matches the validation inside
// lux_tfhe_bootstrap / lux_blind_rotate.
bool lux_fhe_is_valid_N(uint32_t N);

// True iff (l, base_log) satisfies the gadget contract on a 64-bit modulus q:
//   l in [1, 64], base_log in [1, 64], l*base_log <= 64.
// The bottom gadget level q / B^l collapses to zero when l*base_log >= 64.
// NOTE(review): as written, l*base_log == 64 is both accepted by the contract
// and described as collapsing — confirm whether the bound should be strict.
bool lux_fhe_is_valid_gadget(uint32_t l, uint32_t base_log);

// True iff the full TFHE-AP PBS parameter set is well-formed:
//   k >= 1, N a power of two in (0, 2^20], gadget contract holds, q != 0.
bool lux_fhe_is_valid_pbs(uint32_t n_lwe,
                          uint32_t N,
                          uint32_t k,
                          uint32_t l,
                          uint32_t base_log,
                          uint64_t q);


// Buffer-size helpers. All lengths are in u64 words; 0 signals invalid
// parameters.

// Total BSK length: n_lwe * (k+1) * l * (k+1) * N.
// Returns 0 if the parameters fail lux_fhe_is_valid_pbs.
// NOTE(review): this function receives neither base_log nor q, so it cannot
// run the full lux_fhe_is_valid_pbs check as documented — confirm which
// subset of the validation actually applies.
size_t lux_fhe_bsk_words(uint32_t n_lwe, uint32_t N, uint32_t k, uint32_t l);

// KSK length: n_in * l * (n_out + 1).
// Returns 0 if any of (n_in, n_out, l, base_log) is degenerate.
size_t lux_fhe_ksk_words(uint32_t n_in,
                         uint32_t n_out,
                         uint32_t l,
                         uint32_t base_log);

// Output LWE length for sample-extract: k*N + 1.
// Returns 0 if k = 0 or N is not a power of two.
size_t lux_fhe_lwe_out_words(uint32_t N, uint32_t k);

// Accumulator GLWE length: (k+1) * N.
// Returns 0 if k = 0 or N is not a power of two.
size_t lux_fhe_acc_words(uint32_t N, uint32_t k);


// Suggested base_log for `l` decomposition levels on a 64-bit modulus q:
//   floor(log2(q) / l), clamped to [1, 64].
// Returns 0 when l = 0 or q = 0.
uint32_t lux_fhe_suggest_base_log(uint32_t l, uint64_t q);


// Signed-digit decomposition of one u64 value at gadget level `level` under
// `base_log`. Returns the centered digit, in the range
// [-2^{base_log-1}, 2^{base_log-1}). Exposed so callers can build BSK / KSK
// fixtures with the exact gadget encoding the bootstrap consumes.
//
// IMPORTANT: this is the legacy per-digit form. Each digit is extracted
// INDEPENDENTLY (no carry propagation), so for typical inputs the per-digit
// residue can be as large as q/B. For numerically-stable PBS / keyswitch use
// lux_fhe_signed_decomp_all, which carry-propagates and bounds the total
// residue at q/B^l.
int64_t lux_fhe_signed_decomp_digit(uint64_t value,
                                    uint32_t level,
                                    uint32_t base_log);

// Carry-propagating signed-radix decomposition. Writes l digits into out[],
// top-down: out[0] has gadget weight q/B (largest) and out[l-1] has weight
// q/B^l (smallest), with |out[lvl]| <= B/2. The aggregate approximation
// error is bounded by q/B^l, matching OpenFHE's SignedDigitDecompose.
//
// Returns false (and leaves out[] unchanged) if (l, base_log) fail
// lux_fhe_is_valid_gadget.
bool lux_fhe_signed_decomp_all(uint64_t value,
                               uint32_t l,
                               uint32_t base_log,
                               int64_t* out);


// a_tilde = round(a * 2N / q) mod 2N: the rotation amount used by the
// blind-rotation loop. Exposes the exact rounding the implementation uses.
uint32_t lux_fhe_compute_a_tilde(uint64_t a, uint32_t N, uint64_t q);

// Encode plaintext m in [0, modulus) as Δ · m mod q, where Δ = q / modulus.
// Test-vector helper, so callers do not re-implement the same encoding
// inconsistently across the test suite.
uint64_t lux_fhe_encode_message(uint64_t m, uint64_t modulus, uint64_t q);

// Decode an LWE phase to its nearest multiple of Δ = q / modulus and return
// the message in [0, modulus). Inverse of lux_fhe_encode_message under noise
// up to Δ/2.
uint64_t lux_fhe_decode_phase(uint64_t phase, uint64_t modulus, uint64_t q);


// Canonical "plateau" test polynomial used by the FHE test suite:
// coefficients in [N/4, 3N/4) take the value Delta and all other slots are
// zero. Writes N words into `out`. Returns false if N is not a valid PBS N.
// Use it to drive bootstrap correctness tests with a wide noise margin.
bool lux_fhe_test_poly_plateau(uint32_t N, uint64_t Delta, uint64_t* out);

// Canonical "half-plane" test polynomial: 0 for i < N/2, Delta for
// i >= N/2. Conventional TFHE LUT for a single bit; sensitive to noise at
// the N/2 boundary.
// NOTE(review): presumably writes N words into `out` and returns false on an
// invalid N, like the plateau variant — confirm.
bool lux_fhe_test_poly_half(uint32_t N, uint64_t Delta, uint64_t* out);


// gadget[lvl] = q >> ((lvl+1) * base_log). Convenience that mirrors the
// bootstrap and keyswitch internal gadget construction exactly.
// Returns 0 on invalid (l, base_log) — this matches the contract that the
// bottom gadget collapses to 0 when l*base_log >= 64.
// NOTE(review): the text above refers to `l`, but this function only
// receives `level` — presumably the check is on (level+1)*base_log; confirm.
uint64_t lux_fhe_gadget_value(uint32_t level, uint32_t base_log, uint64_t q);

// Reconstruct val ≈ Σ digit[lvl] · gadget[lvl] mod q, in canonical Z_q.
// `digits` is l-long, top-down, signed. Returns 0 on validation failure.
// For digits produced by lux_fhe_signed_decomp_all the reconstruction error
// |reconstructed - val| is bounded by q/B^l.
uint64_t lux_fhe_gadget_reconstruct(const int64_t* digits,
                                    uint32_t l,
                                    uint32_t base_log,
                                    uint64_t q);


// Total decomp scratch length in i64 words: (k+1) * l * N. Mirrors the
// internal scratch the blind-rotation step allocates per AP iteration.
// Returns 0 on validation failure.
size_t lux_fhe_decomp_words(uint32_t N, uint32_t k, uint32_t l);

// Validates KSK-shape parameters independently of PBS-shape parameters.
// True iff (n_in, n_out, l, base_log) is well-formed and q != 0.
bool lux_fhe_is_valid_keyswitch(uint32_t n_in,
                                uint32_t n_out,
                                uint32_t l,
                                uint32_t base_log,
                                uint64_t q);

// Convenience: LWE-input length for keyswitch = n_in + 1, or 0.
// NOTE(review): the condition that yields 0 is not spelled out — presumably
// n_in == 0; confirm.
size_t lux_fhe_keyswitch_in_words(uint32_t n_in);

// Convenience: LWE-output length for keyswitch = n_out + 1, or 0.
// NOTE(review): the condition that yields 0 is not spelled out — presumably
// n_out == 0; confirm.
size_t lux_fhe_keyswitch_out_words(uint32_t n_out);


// Current FHE ABI revision. Lets callers refuse to load a plugin that was
// compiled against an older bootstrap signature.
uint32_t lux_fhe_abi_revision(void);


// =============================================================================

// ZK Primitives: Field Elements and High-Level Operations

// =============================================================================


// BN254 scalar field element (Fr): a 256-bit integer stored as 4 x 64-bit
// limbs. Represents elements of the scalar field of the BN254 curve.
// NOTE(review): limb order (least- vs most-significant first) and whether
// values are in Montgomery form are not stated here — confirm.
typedef struct {
    uint64_t limbs[4];
} LuxFr256;


// Poseidon2 compression over `n` pairs:
//   out[i] = Poseidon2(left[i], right[i])
// Poseidon2 is an algebraic hash function optimized for ZK circuits.
LuxError lux_gpu_poseidon2(LuxGPU* gpu,
                           LuxFr256* out,
                           const LuxFr256* left, const LuxFr256* right,
                           size_t n);

// Merkle tree root over `n` leaves using the Poseidon2 hash. The leaf count
// is padded to the next power of 2 internally. The root is written to `out`.
LuxError lux_gpu_merkle_root(LuxGPU* gpu,
                             LuxFr256* out,
                             const LuxFr256* leaves, size_t n);


// Pedersen-style commitment over `n` entries:
//   out[i] = Poseidon2(Poseidon2(value, blinding), salt)
// Suitable for hiding commitments in ZK protocols.
// NOTE(review): the formula is a nested Poseidon2 hash, not an
// elliptic-curve Pedersen commitment — "Pedersen-style" presumably refers
// only to the (value, blinding) interface; confirm no caller relies on
// additive homomorphism.
LuxError lux_gpu_commitment(LuxGPU* gpu,
                            LuxFr256* out,
                            const LuxFr256* values,
                            const LuxFr256* blindings,
                            const LuxFr256* salts,
                            size_t n);

// Nullifier derivation over `n` entries:
//   out[i] = Poseidon2(Poseidon2(key, commitment), index)
// Used to prevent double-spending in ZK protocols.
LuxError lux_gpu_nullifier(LuxGPU* gpu,
                           LuxFr256* out,
                           const LuxFr256* keys,
                           const LuxFr256* commitments,
                           const LuxFr256* indices,
                           size_t n);


// =============================================================================

// Crypto Operations: Post-Quantum Signatures

// =============================================================================


// ML-DSA-65 (FIPS 204, CRYSTALS-Dilithium) batch signature verification.
//
//   pubkeys     array of public keys (1952 bytes each)
//   messages    array of message hashes (64 bytes each)
//   signatures  array of signatures (3360 bytes each, padded)
//   results     output boolean array (1=valid, 0=invalid)
//   count       number of entries
//
// NOTE(review): FIPS 204 gives the ML-DSA-65 signature as 3309 bytes, so the
// 3360-byte figure is the padded buffer size — confirm the expected padding
// bytes.
LuxError lux_gpu_mldsa_verify_batch(LuxGPU* gpu,
                                    const uint8_t* const* pubkeys,
                                    const uint8_t* const* messages,
                                    const uint8_t* const* signatures,
                                    bool* results, size_t count);

// ML-KEM-768 (FIPS 203, CRYSTALS-Kyber) batch decapsulation.
//
//   secret_keys     array of decapsulation keys (2400 bytes each)
//   ciphertexts     array of ciphertexts (1088 bytes each)
//   shared_secrets  output array of shared secrets (32 bytes each)
//   count           number of entries
LuxError lux_gpu_mlkem_decapsulate_batch(LuxGPU* gpu,
                                         const uint8_t* const* secret_keys,
                                         const uint8_t* const* ciphertexts,
                                         uint8_t** shared_secrets,
                                         size_t count);

// SLH-DSA (FIPS 205, SPHINCS+) batch signature verification.
//
//   pubkeys     array of public keys (32 bytes each for SHAKE-128f)
//   messages    array of message hashes (32 bytes each)
//   signatures  array of signatures (up to 17088 bytes each)
//   results     output boolean array, one entry per signature
//   count       number of entries
//
// NOTE(review): no per-signature length is passed although signatures are
// "up to" 17088 bytes — presumably each buffer is fixed-size; confirm.
LuxError lux_gpu_slhdsa_verify_batch(LuxGPU* gpu,
                                     const uint8_t* const* pubkeys,
                                     const uint8_t* const* messages,
                                     const uint8_t* const* signatures,
                                     bool* results, size_t count);


// =============================================================================

// Crypto Operations: Threshold Signatures

// =============================================================================


// Ringtail lattice-based threshold partial signing

// gpu: LuxGPU context the batch is issued on

// shares: array of pointers to secret shares (1024 bytes each, 256 int32 coefficients)

// messages: array of pointers to message hashes (32 bytes each)

// partial_sigs: output array of pointers to partial signatures (1024 bytes each)

// NOTE(review): presumably each partial_sigs[i] is a caller-allocated 1024-byte buffer; ownership is not stated here -- confirm

// count: number of entries in the batch (presumably the length of each array above; confirm)

// Returns a LuxError status code.

LuxError lux_gpu_ringtail_partial_sign_batch(LuxGPU* gpu,

                                             const uint8_t* const* shares,

                                             const uint8_t* const* messages,

                                             uint8_t** partial_sigs,

                                             size_t count);


// Ringtail threshold combine: merge k partial sigs into one

// gpu: LuxGPU context the batch is issued on

// partial_sigs: array of pointers to partial signatures [count * threshold]

// lagrange_coeffs: Lagrange interpolation coefficients, flat int32 array [count * threshold]

// NOTE(review): ordering within the flattened [count * threshold] arrays is not stated (presumably `threshold` consecutive entries per combined signature) -- confirm

// combined_sigs: output array of pointers to combined signatures [count]

// NOTE(review): per-signature byte size and buffer ownership of combined_sigs are not stated here -- confirm

// threshold: number of partial signatures merged into each combined signature (the "k" above)

// count: number of combined signatures to produce

// Returns a LuxError status code.

LuxError lux_gpu_ringtail_combine_batch(LuxGPU* gpu,

                                        const uint8_t* const* partial_sigs,

                                        const int32_t* lagrange_coeffs,

                                        uint8_t** combined_sigs,

                                        size_t threshold,

                                        size_t count);


// FROST threshold Schnorr partial signature verification

// gpu: LuxGPU context the batch is issued on

// commitments: array of pointers to participant commitments (66 bytes each)

// NOTE(review): 66 bytes is presumably two 33-byte compressed points; encoding is not stated here -- confirm

// signatures: array of pointers to partial signature scalars (32 bytes each)

// pubkeys: array of pointers to public key shares (33 bytes each)

// challenges: array of pointers to pre-computed c*lambda_i scalars (32 bytes each)

// results: output boolean array, one entry per partial signature (presumably 1=valid, 0=invalid as for the ML-DSA batch verifier; confirm)

// count: number of entries in the batch (presumably the length of each array above; confirm)

// Returns a LuxError status code.

LuxError lux_gpu_frost_partial_verify_batch(LuxGPU* gpu,

                                            const uint8_t* const* commitments,

                                            const uint8_t* const* signatures,

                                            const uint8_t* const* pubkeys,

                                            const uint8_t* const* challenges,

                                            bool* results,

                                            size_t count);


// CGGMP21 threshold ECDSA partial signing

// gpu: LuxGPU context the batch is issued on

// inputs: array of pointers; each entry is k_share[32] || chi_share[32] || msg_hash[32] || gamma_share[32] (128 bytes per entry)

// r_x: x-coordinate of combined nonce R (32 bytes); a single buffer, not an array of pointers, so it applies to the whole call

// partial_sigs: output array of pointers to sigma_i values (32 bytes each)

// NOTE(review): presumably each partial_sigs[i] is a caller-allocated 32-byte buffer; ownership is not stated here -- confirm

// count: number of entries in the batch (presumably the length of inputs and partial_sigs; confirm)

// Returns a LuxError status code.

LuxError lux_gpu_cggmp21_partial_sign_batch(LuxGPU* gpu,

                                            const uint8_t* const* inputs,

                                            const uint8_t* r_x,

                                            uint8_t** partial_sigs,

                                            size_t count);


// =============================================================================

// Crypto Operations: Ed25519 / sr25519

// =============================================================================


// Ed25519 batch signature verification

// gpu: LuxGPU context the batch is issued on

// pubkeys: array of pointers to 32-byte compressed points

// messages: array of pointers to 64-byte pre-computed H(R||A||M), reduced mod L by host

// NOTE(review): a scalar reduced mod L fits in 32 bytes; whether the 64-byte buffer holds the raw hash or a padded reduced scalar is unclear from this header -- confirm

// signatures: array of pointers to 64-byte signatures (R[32] || S[32])

// results: output boolean array, one entry per signature (presumably 1=valid, 0=invalid as for the ML-DSA batch verifier; confirm)

// count: number of entries in the batch (presumably the length of each array above; confirm)

// Returns a LuxError status code.

LuxError lux_gpu_ed25519_verify_batch(LuxGPU* gpu,

                                      const uint8_t* const* pubkeys,

                                      const uint8_t* const* messages,

                                      const uint8_t* const* signatures,

                                      bool* results,

                                      size_t count);


// sr25519 (Schnorrkel/Ristretto255) batch signature verification

// gpu: LuxGPU context the batch is issued on

// pubkeys: array of pointers to 32-byte Ristretto255 compressed points

// messages: array of pointers to 64-byte pre-computed transcript hashes

// signatures: array of pointers to 64-byte signatures (R[32] || s[32])

// results: output boolean array, one entry per signature (presumably 1=valid, 0=invalid as for the ML-DSA batch verifier; confirm)

// count: number of entries in the batch (presumably the length of each array above; confirm)

// Returns a LuxError status code.

LuxError lux_gpu_sr25519_verify_batch(LuxGPU* gpu,

                                      const uint8_t* const* pubkeys,

                                      const uint8_t* const* messages,

                                      const uint8_t* const* signatures,

                                      bool* results,

                                      size_t count);


// =============================================================================

// Stream/Event Management

// =============================================================================


// Creates a stream for the given LuxGPU context and returns a handle to it (inferred from the name; confirm).

// NOTE(review): failure signalling (e.g. a NULL return) is not visible in this header; presumably released with lux_stream_destroy() -- confirm

LuxStream* lux_stream_create(LuxGPU* gpu);

// Destroys a stream handle; presumably the counterpart of lux_stream_create() (confirm).

// NOTE(review): whether NULL is accepted and whether pending work is waited for is not stated here -- confirm

void lux_stream_destroy(LuxStream* stream);

// Synchronizes with the given stream; presumably blocks until work queued on it has completed (confirm).

// Returns a LuxError status code.

LuxError lux_stream_sync(LuxStream* stream);


// Creates an event for the given LuxGPU context and returns a handle to it (inferred from the name; confirm).

// NOTE(review): failure signalling (e.g. a NULL return) is not visible in this header; presumably released with lux_event_destroy() -- confirm

LuxEvent* lux_event_create(LuxGPU* gpu);

// Destroys an event handle; presumably the counterpart of lux_event_create() (confirm).

// NOTE(review): whether NULL is accepted is not stated here -- confirm

void lux_event_destroy(LuxEvent* event);

// Records `event` on `stream`; presumably marks the point in the stream's queued work that the event represents (confirm).

// Returns a LuxError status code.

LuxError lux_event_record(LuxEvent* event, LuxStream* stream);

// Waits on `event` with respect to `stream`.

// NOTE(review): presumably makes later work on `stream` wait for `event` rather than blocking the host, but this header does not say -- confirm

// Returns a LuxError status code.

LuxError lux_event_wait(LuxEvent* event, LuxStream* stream);

// Returns the elapsed time between the `start` and `end` events as a float.

// NOTE(review): units (milliseconds vs. seconds) and the result for unrecorded or incomplete events are not stated here -- confirm

float lux_event_elapsed(LuxEvent* start, LuxEvent* end);


#ifdef __cplusplus

}

#endif


#endif // LUX_GPU_H

lux_tensor_to_host
LuxError lux_tensor_to_host(LuxTensor *tensor, void *data, size_t size)

lux_fhe_gadget_reconstruct
uint64_t lux_fhe_gadget_reconstruct(const int64_t *digits, uint32_t l, uint32_t base_log, uint64_t q)

lux_ntt_batch
LuxError lux_ntt_batch(LuxGPU *gpu, uint64_t **polys, size_t count, size_t n, uint64_t modulus)

lux_fhe_bsk_words
size_t lux_fhe_bsk_words(uint32_t n_lwe, uint32_t N, uint32_t k, uint32_t l)

lux_fhe_encode_message
uint64_t lux_fhe_encode_message(uint64_t m, uint64_t modulus, uint64_t q)

lux_tensor_add
LuxTensor * lux_tensor_add(LuxGPU *gpu, LuxTensor *a, LuxTensor *b)

lux_gpu_destroy
void lux_gpu_destroy(LuxGPU *gpu)

lux_tensor_gelu
LuxTensor * lux_tensor_gelu(LuxGPU *gpu, LuxTensor *t)

lux_tensor_mean
LuxTensor * lux_tensor_mean(LuxGPU *gpu, LuxTensor *t, const int *axes, int naxes)

lux_device_count
int lux_device_count(LuxBackend backend)

lux_fhe_is_valid_gadget
bool lux_fhe_is_valid_gadget(uint32_t l, uint32_t base_log)

lux_gpu_keccak256_batch
LuxError lux_gpu_keccak256_batch(LuxGPU *gpu, const uint8_t *inputs, uint8_t *outputs, const size_t *input_lens, size_t num_inputs)

lux_event_destroy
void lux_event_destroy(LuxEvent *event)

LuxTensor
struct LuxTensor LuxTensor
Definition gpu.h:93

lux_tensor_dtype
LuxDtype lux_tensor_dtype(LuxTensor *tensor)

lux_tensor_rms_norm
LuxTensor * lux_tensor_rms_norm(LuxGPU *gpu, LuxTensor *t, LuxTensor *weight, float eps)

lux_device_info
LuxError lux_device_info(LuxBackend backend, int index, LuxDeviceInfo *info)

lux_gpu_error
const char * lux_gpu_error(LuxGPU *gpu)

lux_event_wait
LuxError lux_event_wait(LuxEvent *event, LuxStream *stream)

lux_event_record
LuxError lux_event_record(LuxEvent *event, LuxStream *stream)

lux_gpu_slhdsa_verify_batch
LuxError lux_gpu_slhdsa_verify_batch(LuxGPU *gpu, const uint8_t *const *pubkeys, const uint8_t *const *messages, const uint8_t *const *signatures, bool *results, size_t count)

lux_tensor_reduce_min
float lux_tensor_reduce_min(LuxGPU *gpu, LuxTensor *t)

lux_gpu_create
LuxGPU * lux_gpu_create(void)

lux_tensor_copy
LuxTensor * lux_tensor_copy(LuxGPU *gpu, LuxTensor *t)

lux_gpu_cggmp21_partial_sign_batch
LuxError lux_gpu_cggmp21_partial_sign_batch(LuxGPU *gpu, const uint8_t *const *inputs, const uint8_t *r_x, uint8_t **partial_sigs, size_t count)

lux_ntt_inverse
LuxError lux_ntt_inverse(LuxGPU *gpu, uint64_t *data, size_t n, uint64_t modulus)

lux_fhe_test_poly_half
bool lux_fhe_test_poly_half(uint32_t N, uint64_t Delta, uint64_t *out)

lux_tensor_reduce_mean
float lux_tensor_reduce_mean(LuxGPU *gpu, LuxTensor *t)

lux_gpu_sync
LuxError lux_gpu_sync(LuxGPU *gpu)

LuxStream
struct LuxStream LuxStream
Definition gpu.h:94

lux_gpu_backend_name
const char * lux_gpu_backend_name(LuxGPU *gpu)

lux_tensor_matmul
LuxTensor * lux_tensor_matmul(LuxGPU *gpu, LuxTensor *a, LuxTensor *b)

lux_fhe_test_poly_plateau
bool lux_fhe_test_poly_plateau(uint32_t N, uint64_t Delta, uint64_t *out)

lux_tensor_softmax
LuxTensor * lux_tensor_softmax(LuxGPU *gpu, LuxTensor *t, int axis)

lux_fhe_decomp_words
size_t lux_fhe_decomp_words(uint32_t N, uint32_t k, uint32_t l)

lux_fhe_abi_revision
uint32_t lux_fhe_abi_revision(void)

lux_tensor_sqrt
LuxTensor * lux_tensor_sqrt(LuxGPU *gpu, LuxTensor *t)

LuxDtype
LuxDtype
Definition gpu.h:56

LUX_UINT64
@ LUX_UINT64
Definition gpu.h:63

LUX_INT64
@ LUX_INT64
Definition gpu.h:61

LUX_BFLOAT16
@ LUX_BFLOAT16
Definition gpu.h:59

LUX_BOOL
@ LUX_BOOL
Definition gpu.h:64

LUX_UINT32
@ LUX_UINT32
Definition gpu.h:62

LUX_INT32
@ LUX_INT32
Definition gpu.h:60

LUX_FLOAT32
@ LUX_FLOAT32
Definition gpu.h:57

LUX_FLOAT16
@ LUX_FLOAT16
Definition gpu.h:58

lux_msm
LuxError lux_msm(LuxGPU *gpu, const void *scalars, const void *points, void *result, size_t count, LuxCurve curve)

lux_tensor_from_data
LuxTensor * lux_tensor_from_data(LuxGPU *gpu, const void *data, const int64_t *shape, int ndim, LuxDtype dtype)

lux_gpu_ecrecover_batch
LuxError lux_gpu_ecrecover_batch(LuxGPU *gpu, const LuxEcrecoverInput *signatures, LuxEcrecoverOutput *addresses, size_t num_signatures)

lux_gpu_mlkem_decapsulate_batch
LuxError lux_gpu_mlkem_decapsulate_batch(LuxGPU *gpu, const uint8_t *const *secret_keys, const uint8_t *const *ciphertexts, uint8_t **shared_secrets, size_t count)

lux_gpu_device_info
LuxError lux_gpu_device_info(LuxGPU *gpu, LuxDeviceInfo *info)

lux_tensor_log_softmax
LuxTensor * lux_tensor_log_softmax(LuxGPU *gpu, LuxTensor *t, int axis)

lux_fhe_compute_a_tilde
uint32_t lux_fhe_compute_a_tilde(uint64_t a, uint32_t N, uint64_t q)

lux_event_elapsed
float lux_event_elapsed(LuxEvent *start, LuxEvent *end)

lux_backend_name
const char * lux_backend_name(LuxBackend backend)

lux_tensor_shape
int64_t lux_tensor_shape(LuxTensor *tensor, int dim)

lux_tensor_zeros
LuxTensor * lux_tensor_zeros(LuxGPU *gpu, const int64_t *shape, int ndim, LuxDtype dtype)

lux_fhe_keyswitch_out_words
size_t lux_fhe_keyswitch_out_words(uint32_t n_out)

lux_bn254_add
LuxError lux_bn254_add(LuxGPU *gpu, const void *a, const void *b, void *out, size_t count, bool is_g2)

lux_fhe_is_valid_pbs
bool lux_fhe_is_valid_pbs(uint32_t n_lwe, uint32_t N, uint32_t k, uint32_t l, uint32_t base_log, uint64_t q)

lux_gpu_create_with_backend
LuxGPU * lux_gpu_create_with_backend(LuxBackend backend)

lux_gpu_nullifier
LuxError lux_gpu_nullifier(LuxGPU *gpu, LuxFr256 *out, const LuxFr256 *keys, const LuxFr256 *commitments, const LuxFr256 *indices, size_t n)

lux_tensor_size
int64_t lux_tensor_size(LuxTensor *tensor)

lux_bls12_381_add
LuxError lux_bls12_381_add(LuxGPU *gpu, const void *a, const void *b, void *out, size_t count, bool is_g2)

lux_poseidon2_hash
LuxError lux_poseidon2_hash(LuxGPU *gpu, const uint64_t *inputs, uint64_t *outputs, size_t rate, size_t num_hashes)

lux_tensor_destroy
void lux_tensor_destroy(LuxTensor *tensor)

lux_stream_sync
LuxError lux_stream_sync(LuxStream *stream)

lux_fhe_signed_decomp_all
bool lux_fhe_signed_decomp_all(uint64_t value, uint32_t l, uint32_t base_log, int64_t *out)

lux_bn254_mul
LuxError lux_bn254_mul(LuxGPU *gpu, const void *points, const void *scalars, void *out, size_t count, bool is_g2)

lux_fhe_lwe_out_words
size_t lux_fhe_lwe_out_words(uint32_t N, uint32_t k)

lux_fhe_keyswitch_in_words
size_t lux_fhe_keyswitch_in_words(uint32_t n_in)

lux_tensor_relu
LuxTensor * lux_tensor_relu(LuxGPU *gpu, LuxTensor *t)

lux_tensor_neg
LuxTensor * lux_tensor_neg(LuxGPU *gpu, LuxTensor *t)

lux_gpu_sr25519_verify_batch
LuxError lux_gpu_sr25519_verify_batch(LuxGPU *gpu, const uint8_t *const *pubkeys, const uint8_t *const *messages, const uint8_t *const *signatures, bool *results, size_t count)

lux_tensor_reduce_max
float lux_tensor_reduce_max(LuxGPU *gpu, LuxTensor *t)

lux_fhe_is_valid_keyswitch
bool lux_fhe_is_valid_keyswitch(uint32_t n_in, uint32_t n_out, uint32_t l, uint32_t base_log, uint64_t q)

lux_tensor_layer_norm
LuxTensor * lux_tensor_layer_norm(LuxGPU *gpu, LuxTensor *t, LuxTensor *gamma, LuxTensor *beta, float eps)

lux_fhe_acc_words
size_t lux_fhe_acc_words(uint32_t N, uint32_t k)

lux_tfhe_keyswitch
LuxError lux_tfhe_keyswitch(LuxGPU *gpu, const uint64_t *lwe_in, uint64_t *lwe_out, const uint64_t *ksk, uint32_t n_in, uint32_t n_out, uint32_t l, uint32_t base_log, uint64_t q)

LuxCurve
LuxCurve
Definition gpu.h:81

LUX_CURVE_BLS12_381
@ LUX_CURVE_BLS12_381
Definition gpu.h:82

LUX_CURVE_SECP256K1
@ LUX_CURVE_SECP256K1
Definition gpu.h:84

LUX_CURVE_BN254
@ LUX_CURVE_BN254
Definition gpu.h:83

LUX_CURVE_ED25519
@ LUX_CURVE_ED25519
Definition gpu.h:85

lux_kzg_open
LuxError lux_kzg_open(LuxGPU *gpu, const void *coeffs, const void *srs, const void *point, void *proof, size_t degree, LuxCurve curve)

lux_kzg_verify
LuxError lux_kzg_verify(LuxGPU *gpu, const void *commitment, const void *proof, const void *point, const void *value, const void *srs_g2, bool *result, LuxCurve curve)

lux_fhe_suggest_base_log
uint32_t lux_fhe_suggest_base_log(uint32_t l, uint64_t q)

lux_blake3_hash
LuxError lux_blake3_hash(LuxGPU *gpu, const uint8_t *inputs, uint8_t *outputs, const size_t *input_lens, size_t num_hashes)

LuxBackend
LuxBackend
Definition gpu.h:48

LUX_BACKEND_DAWN
@ LUX_BACKEND_DAWN
Definition gpu.h:53

LUX_BACKEND_AUTO
@ LUX_BACKEND_AUTO
Definition gpu.h:49

LUX_BACKEND_CUDA
@ LUX_BACKEND_CUDA
Definition gpu.h:52

LUX_BACKEND_CPU
@ LUX_BACKEND_CPU
Definition gpu.h:50

LUX_BACKEND_METAL
@ LUX_BACKEND_METAL
Definition gpu.h:51

lux_tensor_min
LuxTensor * lux_tensor_min(LuxGPU *gpu, LuxTensor *t, const int *axes, int naxes)

lux_tensor_transpose
LuxTensor * lux_tensor_transpose(LuxGPU *gpu, LuxTensor *t)

lux_tensor_sigmoid
LuxTensor * lux_tensor_sigmoid(LuxGPU *gpu, LuxTensor *t)

lux_tensor_sum
LuxTensor * lux_tensor_sum(LuxGPU *gpu, LuxTensor *t, const int *axes, int naxes)

lux_gpu_commitment
LuxError lux_gpu_commitment(LuxGPU *gpu, LuxFr256 *out, const LuxFr256 *values, const LuxFr256 *blindings, const LuxFr256 *salts, size_t n)

lux_tensor_exp
LuxTensor * lux_tensor_exp(LuxGPU *gpu, LuxTensor *t)

lux_tensor_ones
LuxTensor * lux_tensor_ones(LuxGPU *gpu, const int64_t *shape, int ndim, LuxDtype dtype)

lux_stream_destroy
void lux_stream_destroy(LuxStream *stream)

lux_kzg_commit
LuxError lux_kzg_commit(LuxGPU *gpu, const void *coeffs, const void *srs, void *commitment, size_t degree, LuxCurve curve)

lux_tensor_log
LuxTensor * lux_tensor_log(LuxGPU *gpu, LuxTensor *t)

lux_bls_verify
LuxError lux_bls_verify(LuxGPU *gpu, const uint8_t *sig, size_t sig_len, const uint8_t *msg, size_t msg_len, const uint8_t *pubkey, size_t pubkey_len, bool *result)

lux_gpu_frost_partial_verify_batch
LuxError lux_gpu_frost_partial_verify_batch(LuxGPU *gpu, const uint8_t *const *commitments, const uint8_t *const *signatures, const uint8_t *const *pubkeys, const uint8_t *const *challenges, bool *results, size_t count)

LuxGPU
struct LuxGPU LuxGPU
Definition gpu.h:92

lux_tensor_mul
LuxTensor * lux_tensor_mul(LuxGPU *gpu, LuxTensor *a, LuxTensor *b)

lux_fhe_ksk_words
size_t lux_fhe_ksk_words(uint32_t n_in, uint32_t n_out, uint32_t l, uint32_t base_log)

lux_event_create
LuxEvent * lux_event_create(LuxGPU *gpu)

lux_ntt_forward
LuxError lux_ntt_forward(LuxGPU *gpu, uint64_t *data, size_t n, uint64_t modulus)

lux_gpu_mldsa_verify_batch
LuxError lux_gpu_mldsa_verify_batch(LuxGPU *gpu, const uint8_t *const *pubkeys, const uint8_t *const *messages, const uint8_t *const *signatures, bool *results, size_t count)

lux_bls_aggregate
LuxError lux_bls_aggregate(LuxGPU *gpu, const uint8_t *const *sigs, const size_t *sig_lens, int count, uint8_t *out, size_t *out_len)

lux_tensor_reduce_sum
float lux_tensor_reduce_sum(LuxGPU *gpu, LuxTensor *t)

lux_gpu_backend
LuxBackend lux_gpu_backend(LuxGPU *gpu)

lux_gpu_set_backend
LuxError lux_gpu_set_backend(LuxGPU *gpu, LuxBackend backend)

lux_tensor_div
LuxTensor * lux_tensor_div(LuxGPU *gpu, LuxTensor *a, LuxTensor *b)

lux_gpu_merkle_root
LuxError lux_gpu_merkle_root(LuxGPU *gpu, LuxFr256 *out, const LuxFr256 *leaves, size_t n)

lux_gpu_poseidon2
LuxError lux_gpu_poseidon2(LuxGPU *gpu, LuxFr256 *out, const LuxFr256 *left, const LuxFr256 *right, size_t n)

lux_backend_count
int lux_backend_count(void)

lux_bls_verify_batch
LuxError lux_bls_verify_batch(LuxGPU *gpu, const uint8_t *const *sigs, const size_t *sig_lens, const uint8_t *const *msgs, const size_t *msg_lens, const uint8_t *const *pubkeys, const size_t *pubkey_lens, int count, bool *results)

LuxError
LuxError
Definition gpu.h:67

LUX_OK
@ LUX_OK
Definition gpu.h:68

LUX_ERROR_INVALID_ARGUMENT
@ LUX_ERROR_INVALID_ARGUMENT
Definition gpu.h:69

LUX_ERROR_NOT_SUPPORTED
@ LUX_ERROR_NOT_SUPPORTED
Definition gpu.h:74

LUX_ERROR_KERNEL_FAILED
@ LUX_ERROR_KERNEL_FAILED
Definition gpu.h:73

LUX_ERROR_DEVICE_NOT_FOUND
@ LUX_ERROR_DEVICE_NOT_FOUND
Definition gpu.h:72

LUX_ERROR_OUT_OF_MEMORY
@ LUX_ERROR_OUT_OF_MEMORY
Definition gpu.h:70

LUX_ERROR_BACKEND_NOT_AVAILABLE
@ LUX_ERROR_BACKEND_NOT_AVAILABLE
Definition gpu.h:71

lux_gpu_ed25519_verify_batch
LuxError lux_gpu_ed25519_verify_batch(LuxGPU *gpu, const uint8_t *const *pubkeys, const uint8_t *const *messages, const uint8_t *const *signatures, bool *results, size_t count)

lux_fhe_gadget_value
uint64_t lux_fhe_gadget_value(uint32_t level, uint32_t base_log, uint64_t q)

lux_gpu_ringtail_combine_batch
LuxError lux_gpu_ringtail_combine_batch(LuxGPU *gpu, const uint8_t *const *partial_sigs, const int32_t *lagrange_coeffs, uint8_t **combined_sigs, size_t threshold, size_t count)

lux_fhe_signed_decomp_digit
int64_t lux_fhe_signed_decomp_digit(uint64_t value, uint32_t level, uint32_t base_log)

lux_blind_rotate
LuxError lux_blind_rotate(LuxGPU *gpu, uint64_t *acc, const uint64_t *bsk, const uint64_t *lwe_a, uint32_t n_lwe, uint32_t N, uint32_t k, uint32_t l, uint32_t base_log, uint64_t q)

lux_gpu_create_with_device
LuxGPU * lux_gpu_create_with_device(LuxBackend backend, int device_index)

lux_fhe_decode_phase
uint64_t lux_fhe_decode_phase(uint64_t phase, uint64_t modulus, uint64_t q)

lux_gpu_ringtail_partial_sign_batch
LuxError lux_gpu_ringtail_partial_sign_batch(LuxGPU *gpu, const uint8_t *const *shares, const uint8_t *const *messages, uint8_t **partial_sigs, size_t count)

lux_bls12_381_mul
LuxError lux_bls12_381_mul(LuxGPU *gpu, const void *points, const void *scalars, void *out, size_t count, bool is_g2)

lux_stream_create
LuxStream * lux_stream_create(LuxGPU *gpu)

lux_bls12_381_pairing
LuxError lux_bls12_381_pairing(LuxGPU *gpu, const void *g1_points, const void *g2_points, void *out, size_t count)

lux_fhe_is_valid_N
bool lux_fhe_is_valid_N(uint32_t N)

lux_poly_mul
LuxError lux_poly_mul(LuxGPU *gpu, const uint64_t *a, const uint64_t *b, uint64_t *result, size_t n, uint64_t modulus)

lux_tensor_sub
LuxTensor * lux_tensor_sub(LuxGPU *gpu, LuxTensor *a, LuxTensor *b)

lux_tensor_tanh
LuxTensor * lux_tensor_tanh(LuxGPU *gpu, LuxTensor *t)

lux_tensor_abs
LuxTensor * lux_tensor_abs(LuxGPU *gpu, LuxTensor *t)

lux_tensor_full
LuxTensor * lux_tensor_full(LuxGPU *gpu, const int64_t *shape, int ndim, LuxDtype dtype, double value)

lux_tfhe_bootstrap
LuxError lux_tfhe_bootstrap(LuxGPU *gpu, const uint64_t *lwe_in, uint64_t *lwe_out, const uint64_t *bsk, const uint64_t *test_poly, uint32_t n_lwe, uint32_t N, uint32_t k, uint32_t l, uint32_t base_log, uint64_t q)

lux_tensor_ndim
int lux_tensor_ndim(LuxTensor *tensor)

LuxEvent
struct LuxEvent LuxEvent
Definition gpu.h:95

lux_backend_available
bool lux_backend_available(LuxBackend backend)

lux_tensor_max
LuxTensor * lux_tensor_max(LuxGPU *gpu, LuxTensor *t, const int *axes, int naxes)

LuxDeviceInfo
Definition gpu.h:101

LuxDeviceInfo::max_workgroup_size
int max_workgroup_size
Definition gpu.h:111

LuxDeviceInfo::memory_total
uint64_t memory_total
Definition gpu.h:106

LuxDeviceInfo::is_discrete
bool is_discrete
Definition gpu.h:108

LuxDeviceInfo::vendor
const char * vendor
Definition gpu.h:105

LuxDeviceInfo::is_unified_memory
bool is_unified_memory
Definition gpu.h:109

LuxDeviceInfo::backend
LuxBackend backend
Definition gpu.h:102

LuxDeviceInfo::name
const char * name
Definition gpu.h:104

LuxDeviceInfo::memory_available
uint64_t memory_available
Definition gpu.h:107

LuxDeviceInfo::compute_units
int compute_units
Definition gpu.h:110

LuxDeviceInfo::index
int index
Definition gpu.h:103

LuxEcrecoverInput
Definition gpu.h:272

LuxEcrecoverInput::v
uint8_t v
Definition gpu.h:275

LuxEcrecoverOutput
Definition gpu.h:282

LuxEcrecoverOutput::valid
uint8_t valid
Definition gpu.h:284

LuxFr256
Definition gpu.h:602