What changes here?

Diff stats: 20 removals, 13 additions (538 lines before, 535 lines after).

In short, version 2 drops the matmul bias parameter tensors (qkvb, attprojb, fcb, fcprojb) from the model: NUM_PARAMETER_TENSORS goes from 16 to 12, and the parameter-size table and pointer bookkeeping are renumbered to match. The layernorm biases (ln1b, ln2b, lnfb) are kept. The listing below is the resulting version 2 of the code.
/*
Version 2 of the code, removing bias from forward pass
*/

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string>
#include <string_view>
#include <sys/stat.h>
#include <sys/types.h>
// ----------- CPU utilities -----------
// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck
// defines: create_dir_if_not_exists, find_max_step, ends_with_bin
#include "llmc/utils.h"
// defines: tokenizer_init, tokenizer_decode, tokenizer_free
#include "llmc/tokenizer.h"
// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free
// defines: evalloader_init, evalloader_reset, evalloader_next_batch, evalloader_free
#include "llmc/dataloader.h"
// defines: manual_seed, normal_ (same as torch.manual_seed and torch.normal)
#include "llmc/rand.h"
// defines: lr_scheduler_init, get_learning_rate
#include "llmc/schedulers.h"
// defines: sample_softmax, random_f32
#include "llmc/sampler.h"
// defines: logger_init, logger_log_eval, logger_log_val, logger_log_train
#include "llmc/logger.h"
// defines: get_flops_promised
#include "llmc/mfu.h"
// defines: OutlierDetector, init_detector, update_detector
#include "llmc/outlier_detector.h"
// ----------- GPU utilities -----------
// defines:
// WARP_SIZE, MAX_1024_THREADS_BLOCKS, CEIL_DIV, cudaCheck, PRECISION_MODE
// NVTX_RANGE_FN
#include "llmc/cuda_common.h"
// defines:
// Packed128, f128, x128
// warpReduceSum, warpReduceMax, blockReduce, copy_and_cast_kernel, cudaMallocConditionallyManaged
#include "llmc/cuda_utils.cuh"
// defines: CUBLAS_LOWP, cublasCheck, cublaslt_workspace_size, cublaslt_workspace
// defines: cublas_compute, cublaslt_handle, cublas_handle
#include "llmc/cublas_common.h"
// ----------- Layer implementations in CUDA -----------
// defines: encoder_forward, encoder_backward
#include "llmc/encoder.cuh"
// defines: layernorm_forward, residual_forward, fused_residual_forward5, layernorm_backward
#include "llmc/layernorm.cuh"
// defines: matmul_cublaslt, matmul_forward, matmul_backward, gelu_forward, gelu_backward_inplace
#include "llmc/matmul.cuh"
#ifdef ENABLE_CUDNN
// defines: create_cudnn, destroy_cudnn, attention_forward_cudnn, attention_backward_cudnn
#include "llmc/cudnn_att.h"
#else
// defines: attention_forward, attention_backward
#include "llmc/attention.cuh"
#endif
// defines: fused_classifier
#include "llmc/fused_classifier.cuh"
// defines: adamw_kernel3
#include "llmc/adamw.cuh"
// defines: global_norm_squared
#include "llmc/global_norm.cuh"
// ----------- Multi-GPU support -----------
// defines: ncclFloatX, ncclCheck, MultiGpuConfig, ShardInfo
// defines: printf0, multi_gpu_config
// defines: multi_gpu_config_init, multi_gpu_config_free
// defines: set_zero_configs, multi_gpu_cpu_float_sum, multi_gpu_barrier
// defines: multi_gpu_get_shard_offset, multi_gpu_async_reduce_gradient
#include "llmc/zero.cuh"

// ----------------------------------------------------------------------------
// global vars for I/O
char filename_buffer[512];

// ----------------------------------------------------------------------------
// global vars containing information about the GPU this process is running on
cudaDeviceProp deviceProp; // filled in by common_start()
cudaStream_t main_stream;
// buffer size to use for device <-> disk io
constexpr const size_t IO_BUF_SIZE = 32 * 1024 * 1024;

// ----------------------------------------------------------------------------
// GPT-2 model definition

typedef struct {
    int max_seq_len; // max sequence length, e.g. 1024
    int vocab_size; // vocab size, e.g. 50257
    int padded_vocab_size; // padded to e.g. %128==0, 50304
    int num_layers; // number of layers, e.g. 12
    int num_heads; // number of heads in attention, e.g. 12
    int channels; // number of channels, e.g. 768
} GPTConfig;

// the parameters of the model
// version 2: the matmul bias tensors (qkvb, attprojb, fcb, fcprojb) are removed; only the layernorm biases remain
constexpr const int NUM_PARAMETER_TENSORS = 12;
typedef struct {
    floatX* wte; // (V, C)
    floatX* wpe; // (maxT, C)
    floatX* ln1w; // (L, C)
    floatX* ln1b; // (L, C)
    floatX* qkvw; // (L, 3*C, C)
    floatX* attprojw; // (L, C, C)
    floatX* ln2w; // (L, C)
    floatX* ln2b; // (L, C)
    floatX* fcw; // (L, 4*C, C)
    floatX* fcprojw; // (L, C, 4*C)
    floatX* lnfw; // (C)
    floatX* lnfb; // (C)
} ParameterTensors;
static_assert(sizeof(ParameterTensors) == NUM_PARAMETER_TENSORS * sizeof(void*), "Inconsistent sizes!");
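// note: the static_assert above pins ParameterTensors to exactly NUM_PARAMETER_TENSORS pointer members,
// which keeps the struct in sync with the ptrs[] table in malloc_and_point_parameters() below
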
void fill_in_parameter_sizes(size_t* param_sizes, size_t* param_sizeof, GPTConfig config) {
    size_t Vp = config.padded_vocab_size;
    size_t C = config.channels;
    size_t maxT = config.max_seq_len;
    size_t L = config.num_layers;
    param_sizes[0] = Vp * C; // wte
    param_sizes[1] = maxT * C; // wpe
    param_sizes[2] = L * C; // ln1w
    param_sizes[3] = L * C; // ln1b
    param_sizes[4] = L * (3 * C) * C; // qkvw
    param_sizes[5] = L * C * C; // attprojw
    param_sizes[6] = L * C; // ln2w
    param_sizes[7] = L * C; // ln2b
    param_sizes[8] = L * (4 * C) * C; // fcw
    param_sizes[9] = L * C * (4 * C); // fcprojw
    param_sizes[10] = C; // lnfw
    param_sizes[11] = C; // lnfb

    // populate the parameter sizes in bytes (all the same for now, keeping for future use)
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        param_sizeof[i] = sizeof(floatX);
    }
}

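// Worked example (illustrative only, assuming a GPT-2 (124M)-style config: L=12, C=768, maxT=1024, Vp=50304):
//   wte 38,633,472 + wpe 786,432 + ln1w/ln1b/ln2w/ln2b 36,864 + qkvw 21,233,664
//   + attprojw 7,077,888 + fcw 28,311,552 + fcprojw 28,311,552 + lnfw/lnfb 1,536
//   = 124,392,960 parameters, i.e. roughly the familiar ~124.5M figure minus the ~83K matmul bias parameters removed here.
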
// allocate memory for the parameters and point the individual tensors to the right places
void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elements, size_t *param_sizeof) {
    // calculate the total number of parameters and bytes across all tensors
    size_t num_parameters_bytes = 0;
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        num_parameters_bytes += param_elements[i] * param_sizeof[i];
    }
    // malloc all parameters all at once on the device
    void* params_memory;
    cudaCheck(cudaMalloc((void**)&params_memory, num_parameters_bytes));
    // assign all the tensors their place in the array
    floatX** ptrs[] = {
        &params->wte, &params->wpe, &params->ln1w, &params->ln1b, &params->qkvw,
        &params->attprojw, &params->ln2w, &params->ln2b, &params->fcw,
        &params->fcprojw, &params->lnfw, &params->lnfb
    };
    char* params_memory_iterator = (char*)params_memory;
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        *(ptrs[i]) = (floatX*)params_memory_iterator;
        params_memory_iterator += param_elements[i] * param_sizeof[i];
    }
    return params_memory;
}

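// Usage sketch (illustrative only, not part of the original file; GPT_allocate_weights() below does
// essentially this on the model struct):
#if 0
GPTConfig config; // assume the fields have been filled in, e.g. from a checkpoint header
size_t param_elements[NUM_PARAMETER_TENSORS];
size_t param_sizeof[NUM_PARAMETER_TENSORS];
fill_in_parameter_sizes(param_elements, param_sizeof, config);
ParameterTensors params;
void* params_memory = malloc_and_point_parameters(&params, param_elements, param_sizeof);
// params.wte, params.qkvw, ... now all point into the single device allocation
cudaCheck(cudaFree(params_memory));
#endif
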
constexpr int NUM_ACTIVATION_TENSORS = 21;
typedef struct {
    floatX* encoded; // (B, T, C)
    floatX* ln1; // (L, B, T, C)
    float* ln1_mean; // (L, B, T)
    float* ln1_rstd; // (L, B, T)
    floatX* atty; // (L, B, T, C)
    // cuDNN saves only some statistics information
#if ENABLE_CUDNN
    float* att; // (L, B, NH, T)
#else
    floatX* att; // (L, B, NH, T, T)
#endif

    floatX* residual2; // (L, B, T, C)
    floatX* ln2; // (L, B, T, C)
    float* ln2_mean; // (L, B, T)
    float* ln2_rstd; // (L, B, T)
    floatX* fch; // (L, B, T, 4*C)
    floatX* fch_gelu; // (L, B, T, 4*C)
    floatX* residual3; // (L, B, T, C)
    floatX* lnf; // (B, T, C); if LN recomputation is enabled (-r 2 and above), will be used for _all_ layernorms
    float* lnf_mean; // (B, T)
    float* lnf_rstd; // (B, T)
    float* losses; // (B, T), will be accumulated in micro-steps
    // adding these two compared to the CPU .c code, needed for attention kernel as buffers
    floatX* qkvr; // (L, B, T, 3*C)
    // in inference mode, this buffer will store the logits
    // in training mode, this buffer will contain the *gradients* of the logits.
    // during the processing of transformer blocks, we will also use this as a
    // general scratchpad buffer. Allocation is made large enough to hold (B, T, 3C),
    // (B, NH, T, T), and (B, T, V) shaped tensors.
    floatX* output;

    // some additional scratch buffers
    floatX* scratch_bt4c; // (B, T, 4*C)
    floatX* scratch_btc; // (B, T, C)
} ActivationTensors;

struct TensorSpec {
    void** ptr;
    size_t size;
    DType type;
};

#define TENSOR_SPEC(pointer, size) TensorSpec{(void**)(&pointer), (size), dtype_of(pointer)};

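// e.g. TENSOR_SPEC(data->encoded, B * T * C) expands to
// TensorSpec{(void**)(&data->encoded), (B * T * C), dtype_of(data->encoded)};
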
void fill_in_activation_sizes(const ActivationTensors* data, TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS], size_t B, size_t T, GPTConfig config, int recompute) {
    size_t Vp = config.padded_vocab_size;
    size_t L = config.num_layers;
    size_t NH = config.num_heads;
    size_t C = config.channels;
    tensors[0] = TENSOR_SPEC(data->encoded, B * T * C);
    // if recompute >= 1 then we will recompute the layernorm forward activation during backward pass
    tensors[1] = TENSOR_SPEC(data->ln1, (recompute < 2) ? L * B * T * C : 0);
    tensors[2] = TENSOR_SPEC(data->ln1_mean, L * B * T);
    tensors[3] = TENSOR_SPEC(data->ln1_rstd, L * B * T);
    tensors[4] = TENSOR_SPEC(data->atty, L * B * T * C);
#ifdef ENABLE_CUDNN
    // FP32 stats tensor for cuDNN to be passed to backward pass
    tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T);
#else
    tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T * T);
#endif
    tensors[6] = TENSOR_SPEC(data->residual2, L * B * T * C);
    // if recompute >= 1 then we will recompute the layernorm forward activation during backward pass
    tensors[7] = TENSOR_SPEC(data->ln2, (recompute < 2) ? L * B * T * C : 0);
    tensors[8] = TENSOR_SPEC(data->ln2_mean, L * B * T);
    tensors[9] = TENSOR_SPEC(data->ln2_rstd, L * B * T);
    tensors[10] = TENSOR_SPEC(data->fch, L * B * T * 4*C);
    // if recompute >= 1 then we will recompute gelu_forward during backward and use this as scratch buffer
    tensors[11] = TENSOR_SPEC(data->fch_gelu, (recompute < 1) ? L * B * T * 4*C : B * T * 4*C);
    tensors[12] = TENSOR_SPEC(data->residual3, L * B * T * C);
    tensors[13] = TENSOR_SPEC(data->lnf, B * T * C);
    tensors[14] = TENSOR_SPEC(data->lnf_mean, B * T);
    tensors[15] = TENSOR_SPEC(data->lnf_rstd, B * T);
    tensors[16] = TENSOR_SPEC(data->losses, B * T);
    tensors[17] = TENSOR_SPEC(data->qkvr, L * B * T * 3*C);
    tensors[18] = TENSOR_SPEC(data->output, B * T * max(3*C, max(NH*T, Vp)));

    tensors[19] = TENSOR_SPEC(data->scratch_bt4c, B * T * 4 * C);
    tensors[20] = TENSOR_SPEC(data->scratch_btc, B * T * C);
}

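// Rough sense of scale (illustrative numbers, assuming B=4, T=1024, C=768, L=12 and a 2-byte floatX):
// with the default recompute=1, fch_gelu shrinks from L*B*T*4C to B*T*4C, i.e. the layer-stacked copy of
// (L-1)*4*1024*3072 ~= 138M elements (~264 MiB) is not kept around for the backward pass.
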
void* malloc_and_point_activations(TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS]) {
    size_t bytes = 0;
    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
        bytes += tensors[i].size * sizeof_dtype(tensors[i].type);
    }

    printf0("allocating %d MiB for activations\n", (int)round(bytes / (1024 * 1024)));

    void* acts_memory;
    cudaCheck(cudaMalloc((void**)&acts_memory, bytes));

    // cudaMalloc does not guarantee initial memory values so we memset the allocation here
    // this matters because e.g. non-cuDNN attention assumes the attention buffer is zeroed
    // todo - up to ~100ms on slow GPUs, could theoretically be more selective, but this is safer
    cudaCheck(cudaMemset(acts_memory, 0, bytes));

    char* acts_memory_iterator = (char*)acts_memory;
    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
        // extra protection so we don't accidentally use an empty buffer
        if (tensors[i].size == 0) {
            *(tensors[i].ptr) = NULL;
        } else {
            *(tensors[i].ptr) = acts_memory_iterator;
            acts_memory_iterator += tensors[i].size * sizeof_dtype(tensors[i].type);
        }
    }
    return acts_memory;
}

typedef struct {
    GPTConfig config;
    // the weights of the model, and their sizes
    ParameterTensors params;
    size_t param_elements[NUM_PARAMETER_TENSORS];
    size_t param_sizeof[NUM_PARAMETER_TENSORS];
    void* params_memory;
    size_t num_parameters;
    size_t num_parameters_bytes;
    // gradients of the weights
    ParameterTensors grads;
    void* grads_memory;
    // buffers for the AdamW optimizer
    float* m_memory;
    float* v_memory;
    float* master_weights; // is NULL unless fp32 master weights are enabled.
    // the activations of the model, and their sizes
    ActivationTensors acts;
    TensorSpec acts_specs[NUM_ACTIVATION_TENSORS];
    void* acts_memory;
    // other run state configuration
    int batch_size; // the batch size (B) of current forward pass
    int seq_len; // the sequence length (T) of current forward pass
    int* inputs; // the input tokens for the current forward pass
    int* targets; // the target tokens for the current forward pass
    float mean_loss; // after the last backward micro-batch, will be populated with mean loss across all GPUs and micro-steps
    float* accumulated_mean_loss; // GPU buffer used to accumulate loss across micro-steps
    float* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost
    unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc.
    unsigned long long rng_state_last_update; // RNG before last GPT_update() to re-round identically from master weights
    int use_master_weights; // keep master weights copy in float for optim update? 0|1
    bool init_state; // set to true if master weights need to be initialized
    int gelu_fusion; // fuse gelu via cuBLASLt (0=none, 1=forward, 2=forward+backward)
    int recompute; // recompute gelu | layernorm forward during model backward? 0|1|2
    // todo - if other functions need cpu scratch buffers in the future, reuse as generic scratch?
    int* workload_indices; // encoder_backward, B*T*num_c_groups (int)
    int4* bucket_info; // encoder_backward, B*T*num_c_groups (int4) - size for worst case
} GPT;

void GPT_init_settings(GPT *model) {
    // common inits outside of the model weights
    // memory lazily initialized in forward()
    model->acts_memory = NULL;
    model->inputs = NULL;
    model->targets = NULL;
    model->accumulated_mean_loss = NULL;
    model->cpu_losses = NULL;
    // the B,T params are determined and set, fixed on first batch in forward()
    model->batch_size = 0;
    model->seq_len = 0;
    model->mean_loss = -1.0f; // -1.0f designates no loss, set at end of forward()
    model->params_memory = NULL;
    // memory lazily initialized in backward()
    model->grads_memory = NULL;
    model->workload_indices = NULL; // on cpu, for encoder_backward
    model->bucket_info = NULL; // on cpu, for encoder_backward
    // memory lazily initialized in update()
    model->m_memory = NULL;
    model->v_memory = NULL;
    model->master_weights = NULL;
    // other default settings
    model->rng_state = 13371337 + multi_gpu_config.process_rank; // used in stochastic rounding
    model->use_master_weights = 1; // safe default: do keep master weights in fp32
    model->init_state = true;
    model->recompute = 1; // good default: recompute gelu but not layernorm
    model->gelu_fusion = 0; //deviceProp.major >= 9 ? 2 : 0; // default: off for now (default must match main())
}

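// note: the NULL defaults above are what the assert(... == nullptr) checks in GPT_allocate_weights()
// and GPT_allocate_state() rely on, so each buffer is only ever allocated once
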
void GPT_allocate_weights(GPT *model) {
    // fill in all the parameter tensor dimensions and types
    fill_in_parameter_sizes(model->param_elements, model->param_sizeof, model->config);
    model->num_parameters = 0;
    model->num_parameters_bytes = 0;
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        model->num_parameters += model->param_elements[i];
        model->num_parameters_bytes += model->param_elements[i] * model->param_sizeof[i];
    }
    // create memory for model parameters on the device
    assert(model->params_memory == nullptr);
    model->params_memory = malloc_and_point_parameters(&model->params, model->param_elements, model->param_sizeof);
}

void GPT_allocate_state(GPT *model, int B, int T) {
    printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024)));
    assert(model->grads_memory == nullptr);
    model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof);

    // record the current B,T as well
    model->batch_size = B;
    model->seq_len = T;

    // allocate the space
    fill_in_activation_sizes(&model->acts, model->acts_specs, B, T, model->config, model->recompute);
    model->acts_memory = malloc_and_point_activations(model->acts_specs);
    // also create memory for caching inputs and targets
    cudaCheck(cudaMalloc((void**)&model->inputs, B * T * sizeof(int)));
    cudaCheck(cudaMalloc((void**)&model->targets, B * T * sizeof(int)));
    cudaCheck(cudaMalloc(((void**)&model->accumulated_mean_loss), sizeof(float)));
    cudaCheck(cudaMallocHost((void**)&model->cpu_losses, B * T * sizeof(float)));

    // initialise cpu scratch buffers for encoder backward
    size_t num_c_groups = CEIL_DIV(model->config.channels, (WARP_SIZE * x128::size));
    assert((size_t)(model->batch_size * model->seq_len) * num_c_groups < (1ULL<<31ULL)); // todo - maybe an issue for llama3-400B(?)
    model->workload_indices = (int*)mallocCheck(sizeof(int) * model->batch_size * model->seq_len * num_c_groups);
    model->bucket_info = (int4*)mallocCheck(sizeof(int4) * model->batch_size * model->seq_len * num_c_groups);

    // cudaMallocConditionallyManaged can fall back to cudaMallocManaged if not enough memory on device
    // and returns a status code of 1 if it had to fall back, in that case we want to print warning.
    int memory_status = 0;

    // we will now init the optimizer states and master weights
    // this is usually a substantial amount of memory allocation right here.
    size_t shard_num_parameters = multi_gpu_config.shard_num_parameters; // num parameters we are responsible for
    printf0("allocating %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20);
    printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20);
    assert(model->m_memory == nullptr);
    assert(model->v_memory == nullptr);
    memory_status |= cudaMallocConditionallyManaged((void**)&model->m_memory, shard_num_parameters * sizeof(float));
    memory_status |= cudaMallocConditionallyManaged((void**)&model->v_memory, shard_num_parameters * sizeof(float));

    if (model->use_master_weights == 1) {
        assert(model->master_weights == nullptr);
        printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20);
        memory_status |= cudaMallocConditionallyManaged((void**) &model->master_weights, shard_num_parameters * sizeof(float));
    }

    // report on mixed memory allocation status (re-using our float reduce function, bit awk ok)
    int reduced_memory_status = (int) multi_gpu_cpu_float_sum((float)memory_status, &multi_gpu_config);
    if (reduced_memory_status >= 1) {
        printf0("WARNING: Fell back to cudaMallocManaged when initializing m,v,master_weights on %d GPUs\n", reduced_memory_status);
        printf0("         Prevents an OOM, but code may run much slower due to device <-> host memory movement\n");
    }
    // report on device memory usage
    size_t free, total;
    cudaCheck(cudaMemGetInfo(&free, &total));
    printf0("device memory usage: %zd MiB / %zd MiB\n", (total-free) / 1024 / 1024, total / 1024 / 1024);
    // give an estimate of the maximum batch size
    size_t bytes_per_sequence = 0;
    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
        bytes_per_sequence += model->acts_specs[i].size * sizeof_dtype(model->acts_specs[i].type) / B;
    }
    printf0("memory per sequence: %zu MiB\n", bytes_per_sequence / 1024 / 1024);
    printf0(" -> estimated maximum batch size: %zu\n", B + free / bytes_per_sequence);
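    // Illustrative reading of the estimate above (made-up numbers): with B=4, ~2 GiB of activations
    // per sequence and ~40 GiB of free device memory, this would report roughly 4 + 40/2 = 24.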
}

void GPT_write_to_checkpoint(GPT *model, const char* checkpoint_path) {
    // write the model to a checkpoint file
    printf0("Writing model to %s\n", checkpoint_path);
    FILE *model_file = fopenCheck(checkpoint_path, "wb");
    // write the header first
    int model_header[256];
    memset(model_header, 0, sizeof(model_header));
    model_header[0] = 20240326; // magic number
    assert(PRECISION_MODE == PRECISION_FP32 || PRECISION_MODE == PRECISION_BF16);
    model_header[1] = PRECISION_MODE == PRECISION_FP32 ? 3 : 5; // version
    model_header[2] = model->config.max_seq_len;
    model_header[3] = model->config.vocab_size;
    model_header[4] = model->config.num_layers;
    model_header[5] = model->config.num_heads;
    model_header[6] = model->config.channels;
    model_header[7] = model->config.padded_vocab_size;
    fwriteCheck(model_header, sizeof(int), 256, model_file);
    // write the parameters
    device_to_file(model_file, model->params_memory, model->num_parameters_bytes,
                   IO_BUF_SIZE, main_stream);
    // close file, we're done
    fcloseCheck(model_file);
}

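// note: the header is 256 ints (1 KiB) and the raw parameter bytes follow immediately after it,
// so a reader can skip the first 1024 bytes and stream num_parameters_bytes of floatX data back in
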
void GPT_build_from_checkpoint(GPT *model, const char* checkpoint_path, bool weight_init=true) {
    // If weight_init is true, we will load the weights from this checkpoint .bin file
    // We sometimes want this to be false, if we are going to initialize these weights from
    // the master weights that are instead stored in the state .bin file.
    // In that case, this function mostly loads the model hyperparameters from the header.

    if (PRECISION_MODE == PRECISION_FP16) {
        // TODO for later perhaps, would require us dynamically converting the
        // model weights from fp32 to fp16 online, here in this function, or writing
        // the fp16 weights directly from Python, which we only do for fp32/bf16 atm.
        fprintf(stderr, "build_from_checkpoint() does not support fp16 right now.\n");
        exit(EXIT_FAILURE);
    }

    // read in model from a checkpoint file
    FILE *model_file = fopenCheck(checkpoint_path, "rb");
    int model_header[256];
    freadCheck(model_header, sizeof(int), 256, model_file);
    if (model_header[0] != 20240326) { printf("Bad magic model file\n"); exit(EXIT_FAILURE); }
    int version = model_header[1];
    if (!(version == 3 || version == 5)) {
        // 3 = fp32, padded vocab
        // 5 = bf16, padded vocab, layernorms also in bf16
        fprintf(stderr, "Bad version in model file\n");
        fprintf(stderr, "---> HINT: try to re-run `python train_GPT.py`\n");
        exit(EXIT_FAILURE);
    }

    // check if the precision mode of the checkpoint matches the model precision
    if (weight_init) {
        if (PRECISION_MODE == PRECISION_BF16 && version != 5) {
            fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path);
            fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n");
            exit(EXIT_FAILURE);
        }
        if (PRECISION_MODE == PRECISION_FP32 && version != 3) {
            fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path);
            fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_GPTcu PRECISION=FP32`\n");
            fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n");
            exit(EXIT_FAILURE);
        }
    }

    // read in hyperparameters
    model->config.max_seq_len = model_header[2];
    model->config.vocab_size = model_header[3];
    model->config.num_layers = model_header[4];
    model->config.num_heads = model_header[5];
    model->config.channels = model_header[6];
    model->config.padded_vocab_size = model_header[7];

    // allocate memory for the model parameters
    GPT_allocate_weights(model);

    // read in the parameters if weight_init is true
    if (weight_init) {
        assert(model->params_memory != NULL);
        file_to_device(model->params_memory, model_file, model->num_parameters_bytes, IO_BUF_SIZE, main_stream);
    }
    fcloseCheck(model_file);

    // only return from this function once we are certain the params are ready on the GPU
    cudaCheck(cudaDeviceSynchronize());
}

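// One plausible initialization sequence (an assumption -- the main() that drives these calls is not part of this listing):
#if 0
GPT model;
GPT_init_settings(&model);                               // null out pointers, set defaults
GPT_build_from_checkpoint(&model, "gpt2_124M_bf16.bin"); // hypothetical checkpoint path
GPT_allocate_state(&model, /*B=*/4, /*T=*/1024);         // gradients, activations, optimizer state
#endif
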
void GPT_set_hyperparameters(GPTConfig* config, int option) {
    int channels, num_heads, num_layers;
    if (option == 1) { num_layers = 6; channels = 384; num_heads = 6; } // (unofficial) GPT-tiny (30M)
    else if (option == 2) { num_layers = 12; channels = 768; num_heads = 12; } // GPT (124M)
    else { fprintf(stderr, "Unsupported GPT model option: %d\n", option); exit(EXIT_FAILURE); }

    config->max_seq_len = 1024;
    config->vocab_size = 50257;
    config->num_layers = num_layers;
    config->num_heads = num_heads;
    config->channels = channels;
    config->padded_vocab_size = 50304;
}

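// e.g. GPT_set_hyperparameters(&model.config, 2) selects the GPT (124M) shape: 12 layers, 768 channels, 12 heads
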
void GPT_init_weights(GPT *model) {

    // allocate and random init the memory for all the parameters with GPT-2 schema
    // weights ~N(0, 0.02), biases 0, c_proj weights ~N(0, 0.02/(2*L)**0.5)
    // NOTE: assuming all parameters are of the type floatX, could be relaxed later
    mt19937_state init_rng;
    manual_seed(&init_rng, 42);
    floatX* params_memory_cpu = (floatX*)mallocCheck(model->num_parameters_bytes);
    memset(params_memory_cpu, 0, model->num_parameters_bytes);
    // fill in all the weights with random