# Revision of the base Moondream model to fine-tune.
MOONDREAM_REVISION = "2025-01-09"

# Fraction of the dataset held out for evaluation.
TEST_SIZE = 0.2

# Number of times to repeat the training dataset. Increasing this may cause the model to overfit or
# lose generalization due to catastrophic forgetting. Decreasing it may cause the model to underfit.
EPOCHS = 2

# Number of samples to process in each batch. Set this to the highest value that doesn't cause an
# out-of-memory error, and decrease it if you run out of memory.
BATCH_SIZE = 8

# Number of batches to process before updating the model. Use this to simulate an effective batch
# size larger than your GPU can handle. Set it to 1 to disable gradient accumulation. (A sketch of
# the accumulation loop follows this config block.)
GRAD_ACCUM_STEPS = 1

# Learning rate for the Adam optimizer. Needs to be tuned on a case-by-case basis. As a general rule
# of thumb, scale it by about 1.4x (roughly sqrt(2)) each time you double the effective batch size.
#
# Source: https://www.cs.princeton.edu/~smalladi/blog/2024/01/22/SDEs-ScalingRules/
#
# Note that we linearly warm the learning rate up from 0.1 * LR to LR over the first 10% of the
# training run, and then decay it back to 0.1 * LR over the last 90% of the training run using a
# cosine schedule (sketched below this config block).
LR = 1e-5

# Whether to use Weights & Biases for logging training metrics.
USE_WANDB = False

# End-of-text token appended to each answer so the model learns when to stop generating.
ANSWER_EOS = "<|endoftext|>"

# Number of tokens used to represent each image.
IMG_TOKENS = 729
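
# A minimal sketch of how GRAD_ACCUM_STEPS is typically applied in a training loop. The function
# name and its model / optimizer / dataloader / loss_fn arguments are illustrative assumptions,
# not part of the actual training code.
def train_with_accumulation(model, optimizer, dataloader, loss_fn):
    optimizer.zero_grad()
    for i, batch in enumerate(dataloader):
        # Scale the loss so the accumulated gradient matches one update at an effective batch
        # size of BATCH_SIZE * GRAD_ACCUM_STEPS.
        loss = loss_fn(model, batch) / GRAD_ACCUM_STEPS
        loss.backward()
        # Only step the optimizer every GRAD_ACCUM_STEPS batches.
        if (i + 1) % GRAD_ACCUM_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()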
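
import math

# A minimal sketch of the warmup + cosine decay schedule described in the LR comment above. The
# 10% warmup fraction and the 0.1 * LR bounds come from that comment; the function name and
# signature are assumptions.
def lr_schedule(step, total_steps):
    warmup_steps = max(int(0.1 * total_steps), 1)
    if step < warmup_steps:
        # Linear warmup from 0.1 * LR to LR over the first 10% of steps.
        return 0.1 * LR + 0.9 * LR * (step / warmup_steps)
    # Cosine decay from LR back down to 0.1 * LR over the remaining steps.
    progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
    return 0.1 * LR + 0.9 * LR * 0.5 * (1 + math.cos(math.pi * progress))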