What changes here?
20 removals and 13 additions (the file goes from 538 to 535 lines). In this part of the file, the four linear-layer bias tensors (qkvb, attprojb, fcb, fcprojb) are dropped from ParameterTensors, NUM_PARAMETER_TENSORS goes from 16 to 12, and fill_in_parameter_sizes plus the pointer table in malloc_and_point_parameters are re-indexed to match. The listing below is the resulting Version 2 of the file.
/*
Version 2 of the code, removing bias from forward pass
*/
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string>
#include <string_view>
#include <sys/stat.h>
#include <sys/types.h>
// ----------- CPU utilities -----------
// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck
// defines: create_dir_if_not_exists, find_max_step, ends_with_bin
#include "llmc/utils.h"
// defines: tokenizer_init, tokenizer_decode, tokenizer_free
#include "llmc/tokenizer.h"
// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free
// defines: evalloader_init, evalloader_reset, evalloader_next_batch, evalloader_free
#include "llmc/dataloader.h"
// defines: manual_seed, normal_ (same as torch.manual_seed and torch.normal)
#include "llmc/rand.h"
// defines: lr_scheduler_init, get_learning_rate
#include "llmc/schedulers.h"
// defines: sample_softmax, random_f32
#include "llmc/sampler.h"
// defines: logger_init, logger_log_eval, logger_log_val, logger_log_train
#include "llmc/logger.h"
// defines: get_flops_promised
#include "llmc/mfu.h"
// defines: OutlierDetector, init_detector, update_detector
#include "llmc/outlier_detector.h"
// ----------- GPU utilities -----------
// defines:
// WARP_SIZE, MAX_1024_THREADS_BLOCKS, CEIL_DIV, cudaCheck, PRECISION_MODE
// NVTX_RANGE_FN
#include "llmc/cuda_common.h"
// defines:
// Packed128, f128, x128
// warpReduceSum, warpReduceMax, blockReduce, copy_and_cast_kernel, cudaMallocConditionallyManaged
#include "llmc/cuda_utils.cuh"
// defines: CUBLAS_LOWP, cublasCheck, cublaslt_workspace_size, cublaslt_workspace
// defines: cublas_compute, cublaslt_handle, cublas_handle
#include "llmc/cublas_common.h"
// ----------- Layer implementations in CUDA -----------
// defines: encoder_forward, encoder_backward
#include "llmc/encoder.cuh"
// defines: layernorm_forward, residual_forward, fused_residual_forward5, layernorm_backward
#include "llmc/layernorm.cuh"
// defines: matmul_cublaslt, matmul_forward, matmul_backward, gelu_forward, gelu_backward_inplace
#include "llmc/matmul.cuh"
#ifdef ENABLE_CUDNN
// defines: create_cudnn, destroy_cudnn, attention_forward_cudnn, attention_backward_cudnn
#include "llmc/cudnn_att.h"
#else
// defines: attention_forward, attention_backward
#include "llmc/attention.cuh"
#endif
// defines: fused_classifier
#include "llmc/fused_classifier.cuh"
// defines: adamw_kernel3
#include "llmc/adamw.cuh"
// defines: global_norm_squared
#include "llmc/global_norm.cuh"
// ----------- Multi-GPU support -----------
// defines: ncclFloatX, ncclCheck, MultiGpuConfig, ShardInfo
// defines: printf0, multi_gpu_config
// defines: multi_gpu_config_init, multi_gpu_config_free
// defines: set_zero_configs, multi_gpu_cpu_float_sum, multi_gpu_barrier
// defines: multi_gpu_get_shard_offset, multi_gpu_async_reduce_gradient
#include "llmc/zero.cuh"
// ----------------------------------------------------------------------------
// global vars for I/O
char filename_buffer[512];
// ----------------------------------------------------------------------------
// global vars containing information about the GPU this process is running on
cudaDeviceProp deviceProp; // fills in common_start()
cudaStream_t main_stream;
// buffer size to use for device <-> disk io
constexpr const size_t IO_BUF_SIZE = 32 * 1024 * 1024;
// ----------------------------------------------------------------------------
// GPT-2 model definition
typedef struct {
    int max_seq_len; // max sequence length, e.g. 1024
    int vocab_size; // vocab size, e.g. 50257
    int padded_vocab_size; // padded to e.g. %128==0, 50304
    int num_layers; // number of layers, e.g. 12
    int num_heads; // number of heads in attention, e.g. 12
    int channels; // number of channels, e.g. 768
} GPTConfig;
// the parameters of the model
constexpr const int NUM_PARAMETER_TENSORS = 12;
typedef struct {
    floatX* wte; // (V, C)
    floatX* wpe; // (maxT, C)
    floatX* ln1w; // (L, C)
    floatX* ln1b; // (L, C)
    floatX* qkvw; // (L, 3*C, C)
    floatX* attprojw; // (L, C, C)
    floatX* ln2w; // (L, C)
    floatX* ln2b; // (L, C)
    floatX* fcw; // (L, 4*C, C)
    floatX* fcprojw; // (L, C, 4*C)
    floatX* lnfw; // (C)
    floatX* lnfb; // (C)
} ParameterTensors;
static_assert(sizeof(ParameterTensors) == NUM_PARAMETER_TENSORS * sizeof(void*), "Inconsistent sizes!");
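// note: compared to version 1, only the linear-layer biases (qkvb, attprojb, fcb, fcprojb) are gone;
// the layernorm biases (ln1b, ln2b, lnfb) and all weight tensors remain, so 16 tensors become 12.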
void fill_in_parameter_sizes(size_t* param_sizes, size_t* param_sizeof, GPTConfig config) {
    size_t Vp = config.padded_vocab_size;
    size_t C = config.channels;
    size_t maxT = config.max_seq_len;
    size_t L = config.num_layers;
    param_sizes[0] = Vp * C; // wte
    param_sizes[1] = maxT * C; // wpe
    param_sizes[2] = L * C; // ln1w
    param_sizes[3] = L * C; // ln1b
    param_sizes[4] = L * (3 * C) * C; // qkvw
    param_sizes[5] = L * C * C; // attprojw
    param_sizes[6] = L * C; // ln2w
    param_sizes[7] = L * C; // ln2b
    param_sizes[8] = L * (4 * C) * C; // fcw
    param_sizes[9] = L * C * (4 * C); // fcprojw
    param_sizes[10] = C; // lnfw
    param_sizes[11] = C; // lnfb
    // populate the parameter sizes in bytes (all the same for now, keeping for future use)
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        param_sizeof[i] = sizeof(floatX);
    }
}
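// back-of-the-envelope check (my arithmetic, not part of the original file): for the GPT-2 124M
// shape (L=12, C=768, maxT=1024, Vp=50304) this bias-free layout gives
//   wte 38,633,472 + wpe 786,432 + 12 * (2C + 3C*C + C*C + 2C + 4C*C + 4C*C = 7,080,960) + 2C
//   = 124,392,960 parameters, i.e. 82,944 fewer than the 124,475,904 of the biased model.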
// allocate memory for the parameters and point the individual tensors to the right places
void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elements, size_t *param_sizeof) {
    // calculate the total number of parameters and bytes across all tensors
    size_t num_parameters_bytes = 0;
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        num_parameters_bytes += param_elements[i] * param_sizeof[i];
    }
    // malloc all parameters all at once on the device
    void* params_memory;
    cudaCheck(cudaMalloc((void**)&params_memory, num_parameters_bytes));
    // assign all the tensors their place in the array
    floatX** ptrs[] = {
        &params->wte, &params->wpe, &params->ln1w, &params->ln1b, &params->qkvw,
        &params->attprojw, &params->ln2w, &params->ln2b, &params->fcw,
        &params->fcprojw, &params->lnfw, &params->lnfb
    };
    char* params_memory_iterator = (char*)params_memory;
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        *(ptrs[i]) = (floatX*)params_memory_iterator;
        params_memory_iterator += param_elements[i] * param_sizeof[i];
    }
    return params_memory;
}
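// usage sketch (this is just how the allocators further down use it): call fill_in_parameter_sizes()
// first, then malloc_and_point_parameters() once for params and once for grads; each carves a single
// contiguous cudaMalloc into the 12 tensors, so parameters and gradients share the same layout/offsets.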
constexpr int NUM_ACTIVATION_TENSORS = 21;
typedef struct {
    floatX* encoded; // (B, T, C)
    floatX* ln1; // (L, B, T, C)
    float* ln1_mean; // (L, B, T)
    float* ln1_rstd; // (L, B, T)
    floatX* atty; // (L, B, T, C)
    // cuDNN saves only some statistics information
#if ENABLE_CUDNN
    float* att; // (L, B, NH, T)
#else
    floatX* att; // (L, B, NH, T, T)
#endif
    floatX* residual2; // (L, B, T, C)
    floatX* ln2; // (L, B, T, C)
    float* ln2_mean; // (L, B, T)
    float* ln2_rstd; // (L, B, T)
    floatX* fch; // (L, B, T, 4*C)
    floatX* fch_gelu; // (L, B, T, 4*C)
    floatX* residual3; // (L, B, T, C)
    floatX* lnf; // (B, T, C); if LN recomputation is enabled (-r 2 and above), will be used for _all_ layernorms
    float* lnf_mean; // (B, T)
    float* lnf_rstd; // (B, T)
    float* losses; // (B, T), will be accumulated in micro-steps
    // adding these two compared to the CPU .c code, needed for attention kernel as buffers
    floatX* qkvr; // (L, B, T, 3*C)
    // in inference mode, this buffer will store the logits
    // in training mode, this buffer will contain the *gradients* of the logits.
    // during the processing of transformer blocks, we will also use this as a
    // general scratchpad buffer. Allocation is made large enough to hold (B, T, 3C),
    // (B, NH, T, T), and (B, T, V) shaped tensors.
    floatX* output;
    // some additional scratch buffers
    floatX* scratch_bt4c; // (B, T, 4*C)
    floatX* scratch_btc; // (B, T, C)
} ActivationTensors;
struct TensorSpec {
    void** ptr;
    size_t size;
    DType type;
};
#define TENSOR_SPEC(pointer, size) TensorSpec{(void**)(&pointer), (size), dtype_of(pointer)};
void fill_in_activation_sizes(const ActivationTensors* data, TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS], size_t B, size_t T, GPTConfig config, int recompute) {
    size_t Vp = config.padded_vocab_size;
    size_t L = config.num_layers;
    size_t NH = config.num_heads;
    size_t C = config.channels;
    tensors[0] = TENSOR_SPEC(data->encoded, B * T * C);
    // if recompute >= 1 then we will recompute the layernorm forward activation during backward pass
    tensors[1] = TENSOR_SPEC(data->ln1, (recompute < 2) ? L * B * T * C : 0);
    tensors[2] = TENSOR_SPEC(data->ln1_mean, L * B * T);
    tensors[3] = TENSOR_SPEC(data->ln1_rstd, L * B * T);
    tensors[4] = TENSOR_SPEC(data->atty, L * B * T * C);
#ifdef ENABLE_CUDNN
    // FP32 stats tensor for cuDNN to be passed to backward pass
    tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T);
#else
    tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T * T);
#endif
    tensors[6] = TENSOR_SPEC(data->residual2, L * B * T * C);
    // if recompute >= 1 then we will recompute the layernorm forward activation during backward pass
    tensors[7] = TENSOR_SPEC(data->ln2, (recompute < 2) ? L * B * T * C : 0);
    tensors[8] = TENSOR_SPEC(data->ln2_mean, L * B * T);
    tensors[9] = TENSOR_SPEC(data->ln2_rstd, L * B * T);
    tensors[10] = TENSOR_SPEC(data->fch, L * B * T * 4*C);
    // if recompute >= 1 then we will recompute gelu_forward during backward and use this as scratch buffer
    tensors[11] = TENSOR_SPEC(data->fch_gelu, (recompute < 1) ? L * B * T * 4*C : B * T * 4*C);
    tensors[12] = TENSOR_SPEC(data->residual3, L * B * T * C);
    tensors[13] = TENSOR_SPEC(data->lnf, B * T * C);
    tensors[14] = TENSOR_SPEC(data->lnf_mean, B * T);
    tensors[15] = TENSOR_SPEC(data->lnf_rstd, B * T);
    tensors[16] = TENSOR_SPEC(data->losses, B * T);
    tensors[17] = TENSOR_SPEC(data->qkvr, L * B * T * 3*C);
    tensors[18] = TENSOR_SPEC(data->output, B * T * max(3*C, max(NH*T, Vp)));
    tensors[19] = TENSOR_SPEC(data->scratch_bt4c, B * T * 4 * C);
    tensors[20] = TENSOR_SPEC(data->scratch_btc, B * T * C);
}
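// sizing note for the shared output buffer (tensors[18]): with the GPT-2 124M shape and T=1024,
// 3*C = 2,304 and NH*T = 12,288, both smaller than Vp = 50,304, so the max() resolves to Vp and
// the buffer is effectively B*T*Vp elements - big enough for the logits as well as the (B,T,3C)
// and (B,NH,T,T) scratch uses mentioned in the struct comment above.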
void* malloc_and_point_activations(TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS]) {
    size_t bytes = 0;
    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
        bytes += tensors[i].size * sizeof_dtype(tensors[i].type);
    }
    printf0("allocating %d MiB for activations\n", (int)round(bytes / (1024 * 1024)));
    void* acts_memory;
    cudaCheck(cudaMalloc((void**)&acts_memory, bytes));
    // cudaMalloc does not guarantee initial memory values so we memset the allocation here
    // this matters because e.g. non-cuDNN attention assumes the attention buffer is zeroed
    // todo - up to ~100ms on slow GPUs, could theoretically be more selective, but this is safer
    cudaCheck(cudaMemset(acts_memory, 0, bytes));
    char* acts_memory_iterator = (char*)acts_memory;
    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
        // extra protection so we don't accidentally use an empty buffer
        if (tensors[i].size == 0) {
            *(tensors[i].ptr) = NULL;
        } else {
            *(tensors[i].ptr) = acts_memory_iterator;
            acts_memory_iterator += tensors[i].size * sizeof_dtype(tensors[i].type);
        }
    }
    return acts_memory;
}
typedef struct {
    GPTConfig config;
    // the weights of the model, and their sizes
    ParameterTensors params;
    size_t param_elements[NUM_PARAMETER_TENSORS];
    size_t param_sizeof[NUM_PARAMETER_TENSORS];
    void* params_memory;
    size_t num_parameters;
    size_t num_parameters_bytes;
    // gradients of the weights
    ParameterTensors grads;
    void* grads_memory;
    // buffers for the AdamW optimizer
    float* m_memory;
    float* v_memory;
    float* master_weights; // is NULL unless fp32 weights is enabled.
    // the activations of the model, and their sizes
    ActivationTensors acts;
    TensorSpec acts_specs[NUM_ACTIVATION_TENSORS];
    void* acts_memory;
    // other run state configuration
    int batch_size; // the batch size (B) of current forward pass
    int seq_len; // the sequence length (T) of current forward pass
    int* inputs; // the input tokens for the current forward pass
    int* targets; // the target tokens for the current forward pass
    float mean_loss; // after the last backward micro-batch, will be populated with mean loss across all GPUs and micro-steps
    float* accumulated_mean_loss; // GPU buffer used to accumulate loss across micro-steps
    float* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost
    unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc.
    unsigned long long rng_state_last_update; // RNG before last GPT_update() to re-round identically from master weights
    int use_master_weights; // keep master weights copy in float for optim update? 0|1
    bool init_state; // set to true if master weights need to be initialized
    int gelu_fusion; // fuse gelu via cuBLASLt (0=none, 1=forward, 2=forward+backward)
    int recompute; // recompute gelu | layernorm forward during model backward? 0|1|2
    // todo - if other functions need cpu scratch buffers in the future, reuse as generic scratch?
    int* workload_indices; // encoder_backward, B*T*num_c_groups (int)
    int4* bucket_info; // encoder_backward, B*T*num_c_groups (int4) - size for worst case
} GPT;
void GPT_init_settings(GPT *model) {
    // common inits outside of the model weights
    // memory lazily initialized in forward()
    model->acts_memory = NULL;
    model->inputs = NULL;
    model->targets = NULL;
    model->accumulated_mean_loss = NULL;
    model->cpu_losses = NULL;
    // the B,T params are determined and set, fixed on first batch in forward()
    model->batch_size = 0;
    model->seq_len = 0;
    model->mean_loss = -1.0f; // -1.0f designates no loss, set at end of forward()
    model->params_memory = NULL;
    // memory lazily initialized in backward()
    model->grads_memory = NULL;
    model->workload_indices = NULL; // on cpu, for encoder_backward
    model->bucket_info = NULL; // on cpu, for encoder_backward
    // memory lazily initialized in update()
    model->m_memory = NULL;
    model->v_memory = NULL;
    model->master_weights = NULL;
    // other default settings
    model->rng_state = 13371337 + multi_gpu_config.process_rank; // used in stochastic rounding
    model->use_master_weights = 1; // safe default: do keep master weights in fp32
    model->init_state = true;
    model->recompute = 1; // good default: recompute gelu but not layernorm
    model->gelu_fusion = 0; //deviceProp.major >= 9 ? 2 : 0; // default: off for now (default must match main())
}
void GPT_allocate_weights(GPT *model) {
    // fill in all the parameter tensor dimensions and types
    fill_in_parameter_sizes(model->param_elements, model->param_sizeof, model->config);
    model->num_parameters = 0;
    model->num_parameters_bytes = 0;
    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
        model->num_parameters += model->param_elements[i];
        model->num_parameters_bytes += model->param_elements[i] * model->param_sizeof[i];
    }
    // create memory for model parameters on the device
    assert(model->params_memory == nullptr);
    model->params_memory = malloc_and_point_parameters(&model->params, model->param_elements, model->param_sizeof);
}
void GPT_allocate_state(GPT *model, int B, int T) {
    printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024)));
    assert(model->grads_memory == nullptr);
    model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof);
    // record the current B,T as well
    model->batch_size = B;
    model->seq_len = T;
    // allocate the space
    fill_in_activation_sizes(&model->acts, model->acts_specs, B, T, model->config, model->recompute);
    model->acts_memory = malloc_and_point_activations(model->acts_specs);
    // also create memory for caching inputs and targets
    cudaCheck(cudaMalloc((void**)&model->inputs, B * T * sizeof(int)));
    cudaCheck(cudaMalloc((void**)&model->targets, B * T * sizeof(int)));
    cudaCheck(cudaMalloc(((void**)&model->accumulated_mean_loss), sizeof(float)));
    cudaCheck(cudaMallocHost((void**)&model->cpu_losses, B * T * sizeof(float)));
    // initialise cpu scratch buffers for encoder backward
    size_t num_c_groups = CEIL_DIV(model->config.channels, (WARP_SIZE * x128::size));
    assert((size_t)(model->batch_size * model->seq_len) * num_c_groups < (1ULL<<31ULL)); // todo - maybe an issue for llama3-400B(?)
    model->workload_indices = (int*)mallocCheck(sizeof(int) * model->batch_size * model->seq_len * num_c_groups);
    model->bucket_info = (int4*)mallocCheck(sizeof(int4) * model->batch_size * model->seq_len * num_c_groups);
    // cudaMallocConditionallyManaged can fall back to cudaMallocManaged if not enough memory on device
    // and returns a status code of 1 if it had to fall back, in that case we want to print warning.
    int memory_status = 0;
    // we will now init the optimizer states and master weights
    // this is usually a substantial amount of memory allocation right here.
    size_t shard_num_parameters = multi_gpu_config.shard_num_parameters; // num parameters we are responsible for
    printf0("allocating %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20);
    printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20);
    assert(model->m_memory == nullptr);
    assert(model->v_memory == nullptr);
    memory_status |= cudaMallocConditionallyManaged((void**)&model->m_memory, shard_num_parameters * sizeof(float));
    memory_status |= cudaMallocConditionallyManaged((void**)&model->v_memory, shard_num_parameters * sizeof(float));
    if (model->use_master_weights == 1) {
        assert(model->master_weights == nullptr);
        printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20);
        memory_status |= cudaMallocConditionallyManaged((void**) &model->master_weights, shard_num_parameters * sizeof(float));
    }
    // report on mixed memory allocation status (re-using our float reduce function, bit awk ok)
    int reduced_memory_status = (int) multi_gpu_cpu_float_sum((float)memory_status, &multi_gpu_config);
    if (reduced_memory_status >= 1) {
        printf0("WARNING: Fell back to cudaMallocManaged when initializing m,v,master_weights on %d GPUs\n", reduced_memory_status);
        printf0(" Prevents an OOM, but code may run much slower due to device <-> host memory movement\n");
    }
    // report on device memory usage
    size_t free, total;
    cudaCheck(cudaMemGetInfo(&free, &total));
    printf0("device memory usage: %zd MiB / %zd MiB\n", (total-free) / 1024 / 1024, total / 1024 / 1024);
    // give an estimate of the maximum batch size
    size_t bytes_per_sequence = 0;
    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
        bytes_per_sequence += model->acts_specs[i].size * sizeof_dtype(model->acts_specs[i].type) / B;
    }
    printf0("memory per sequence: %zu MiB\n", bytes_per_sequence / 1024 / 1024);
    printf0(" -> estimated maximum batch size: %zu\n", B + free / bytes_per_sequence);
}
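// the estimate printed above is simply B plus how many extra sequences' worth of activations
// (free / bytes_per_sequence) would still fit in free device memory; it ignores fragmentation
// and later allocations, so it is an optimistic ballpark rather than a guarantee.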
void GPT_write_to_checkpoint(GPT *model, const char* checkpoint_path) {
    // write the model to a checkpoint file
    printf0("Writing model to %s\n", checkpoint_path);
    FILE *model_file = fopenCheck(checkpoint_path, "wb");
    // write the header first
    int model_header[256];
    memset(model_header, 0, sizeof(model_header));
    model_header[0] = 20240326; // magic number
    assert(PRECISION_MODE == PRECISION_FP32 || PRECISION_MODE == PRECISION_BF16);
    model_header[1] = PRECISION_MODE == PRECISION_FP32 ? 3 : 5; // version
    model_header[2] = model->config.max_seq_len;
    model_header[3] = model->config.vocab_size;
    model_header[4] = model->config.num_layers;
    model_header[5] = model->config.num_heads;
    model_header[6] = model->config.channels;
    model_header[7] = model->config.padded_vocab_size;
    fwriteCheck(model_header, sizeof(int), 256, model_file);
    // write the parameters
    device_to_file(model_file, model->params_memory, model->num_parameters_bytes,
                   IO_BUF_SIZE, main_stream);
    // close file, we're done
    fcloseCheck(model_file);
}
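// resulting file layout: a 256-int header (magic 20240326; version 3 = fp32, 5 = bf16; then
// maxT, V, L, NH, C, Vp) followed by the raw parameter tensors back-to-back in ParameterTensors
// order; GPT_build_from_checkpoint below expects exactly this format.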
void GPT_build_from_checkpoint(GPT *model, const char* checkpoint_path, bool weight_init=true) {
    // If weight_init is true, we will load the weights from this checkpoint .bin file
    // We sometimes want this to be false, if we are going to initialize these weights from
    // the master weights that are instead stored in the state .bin file.
    // In that case, this function mostly loads the model hyperparameters from the header.
    if (PRECISION_MODE == PRECISION_FP16) {
        // TODO for later perhaps, would require us dynamically converting the
        // model weights from fp32 to fp16 online, here in this function, or writing
        // the fp16 weights directly from Python, which we only do for fp32/bf16 atm.
        fprintf(stderr, "build_from_checkpoint() does not support fp16 right now.\n");
        exit(EXIT_FAILURE);
    }
    // read in model from a checkpoint file
    FILE *model_file = fopenCheck(checkpoint_path, "rb");
    int model_header[256];
    freadCheck(model_header, sizeof(int), 256, model_file);
    if (model_header[0] != 20240326) { printf("Bad magic model file\n"); exit(EXIT_FAILURE); }
    int version = model_header[1];
    if (!(version == 3 || version == 5)) {
        // 3 = fp32, padded vocab
        // 5 = bf16, padded vocab, layernorms also in bf16
        fprintf(stderr, "Bad version in model file\n");
        fprintf(stderr, "---> HINT: try to re-run `python train_GPT.py`\n");
        exit(EXIT_FAILURE);
    }
    // check if the precision mode of the checkpoint matches the model precision
    if (weight_init) {
        if (PRECISION_MODE == PRECISION_BF16 && version != 5) {
            fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path);
            fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n");
            exit(EXIT_FAILURE);
        }
        if (PRECISION_MODE == PRECISION_FP32 && version != 3) {
            fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path);
            fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_GPTcu PRECISION=FP32`\n");
            fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n");
            exit(EXIT_FAILURE);
        }
    }
    // read in hyperparameters
    model->config.max_seq_len = model_header[2];
    model->config.vocab_size = model_header[3];
    model->config.num_layers = model_header[4];
    model->config.num_heads = model_header[5];
    model->config.channels = model_header[6];
    model->config.padded_vocab_size = model_header[7];
    // allocate memory for the model parameters
    GPT_allocate_weights(model);
    // read in the parameters if weight_init is true
    if (weight_init) {
        assert(model->params_memory != NULL);
        file_to_device(model->params_memory, model_file, model->num_parameters_bytes, IO_BUF_SIZE, main_stream);
    }
    fcloseCheck(model_file);
    // only return from this function once we are certain the params are ready on the GPU
    cudaCheck(cudaDeviceSynchronize());
}
void GPT_set_hyperparameters(GPTConfig* config, int option) {
    int channels, num_heads, num_layers;
    if (option == 1) { num_layers = 6; channels = 384; num_heads = 6; } // (unofficial) GPT-tiny (30M)
    else if (option == 2) { num_layers = 12; channels = 768; num_heads = 12; } // GPT (124M)
    else { fprintf(stderr, "Unsupported GPT model option: %d\n", option); exit(EXIT_FAILURE); }
    config->max_seq_len = 1024;
    config->vocab_size = 50257;
    config->num_layers = num_layers;
    config->num_heads = num_heads;
    config->channels = channels;
    config->padded_vocab_size = 50304;
}
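// example: GPT_set_hyperparameters(&model.config, 2) selects the GPT (124M) shape:
// max_seq_len=1024, vocab_size=50257 (padded to 50304), 12 layers, 12 heads, 768 channels.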
void gpt_innit_weights(GPT *model) {
    // allocate and random init the memory for all the parameters with GPT-2 schema
    // weights ~N(0, 0.02), biases 0, c_proj weights ~N(0, 0.02/(2*L)**0.5)
    // NOTE: assuming all parameters are of the type floatX, could be relaxed later
    mt19937_state init_rng;
    manual_seed(&init_rng, 42);
    floatX* params_memory_cpu = (floatX*)mallocCheck(model->num_parameters_bytes);
    memset(params_memory_cpu, 0, model->num_parameters_bytes);
    // fill in all the weights with random