# Training configuration constants for fine-tuning Moondream2.

# Model revision (checkpoint date tag) to load. Pinned so training is
# reproducible across upstream updates. NOTE(review): presumably passed as the
# `revision` argument when loading the pretrained model — confirm against caller.
MOONDREAM_REVISION = "2024-08-26"

# Fraction of the dataset held out from training. NOTE(review): presumably the
# test split fraction for a train/test split — confirm against caller.
TEST_SIZE = 0.2

# Number of times to repeat the training dataset. Increasing this may cause the model to overfit or
# lose generalization due to catastrophic forgetting. Decreasing it may cause the model to underfit.
EPOCHS = 2

# Number of samples to process in each batch. Set this to the highest value that doesn't cause an
# out-of-memory error. Decrease it if you're running out of memory.
BATCH_SIZE = 8

# Number of batches to process before updating the model. You can use this to simulate a higher batch
# size than your GPU can handle. Set this to 1 to disable gradient accumulation.
GRAD_ACCUM_STEPS = 1

# Learning rate for the Adam optimizer. Needs to be tuned on a case-by-case basis. As a general rule
# of thumb, increase it by 1.4 times each time you double the effective batch size.
#
# Source: https://www.cs.princeton.edu/~smalladi/blog/2024/01/22/SDEs-ScalingRules/
#
# Note that we linearly warm the learning rate up from 0.1 * LR to LR over the first 10% of the
# training run, and then decay it back to 0.1 * LR over the last 90% of the training run using a
# cosine schedule.
LR = 1e-5

# Whether to use Weights and Biases for logging training metrics.
USE_WANDB = False

# End-of-answer marker token. NOTE(review): presumably appended to target
# answers so the model learns when to stop generating — confirm against the
# tokenization code that consumes it.
ANSWER_EOS = "<|endoftext|>"

# Number of tokens used to represent each image.
IMG_TOKENS = 729