diff --git a/moondream/siglip.py b/moondream/siglip.py
new file mode 100644
index 0000000..a5e017b
--- /dev/null
+++ b/moondream/siglip.py
@@ -0,0 +1,54 @@
+"""Estimate how many visually distinct images a streamed dataset sample holds.
+
+Embeds the first SAMPLE_SIZE images with SigLIP, then greedily keeps one
+representative per group of embeddings lying within RADIUS (Euclidean
+distance) of it and prints how many representatives were kept.
+"""
+import transformers
+import torch
+import datasets
+import sklearn.neighbors  # a bare "import sklearn" is not guaranteed to load the submodule
+
+SAMPLE_SIZE = 500  # number of streamed images to embed
+RADIUS = 1.0  # embeddings within this distance of a kept image count as duplicates
+
+device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
+model = transformers.AutoModel.from_pretrained("google/siglip-base-patch16-224").to(device)
+processor = transformers.AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
+nn = sklearn.neighbors.NearestNeighbors(metric="euclidean", radius=RADIUS)
+
+ds = datasets.load_dataset("ehristoforu/midjourney-images", split="train", trust_remote_code=True, streaming=True)\
+                               .select_columns(["image"])\
+                               .map(lambda row: {
+                                   **row,
+                                   "qa": {
+                                       "question": "Describe this image.",
+                                       "answer": "This is an AI image."
+                                   }
+                               })\
+                               .take(SAMPLE_SIZE)
+
+# Materialize the images once: iterating the streaming dataset a second time
+# would fetch and decode every sample again.
+images = [row["image"] for row in ds]
+
+with torch.no_grad():
+    inputs = processor(images=images, return_tensors="pt").to(device)
+    # scikit-learn operates on NumPy arrays, so convert the embeddings up front.
+    image_features = model.get_image_features(**inputs).cpu().numpy()
+
+nn.fit(image_features)
+
+used_indices = set()  # indices already covered by a kept representative
+unique_indices = []  # indices kept as representatives
+for i, feature in enumerate(image_features):
+    if i in used_indices:
+        continue
+
+    # Uses the radius configured on the estimator above.
+    neighbors = nn.radius_neighbors([feature], return_distance=False)[0]
+
+    unique_indices.append(i)
+    used_indices.update(neighbors.tolist())
+
+print(len(unique_indices))
diff --git a/moondream/test.py b/moondream/test.py
index 558b31a..5ad381b 100644
--- a/moondream/test.py
+++ b/moondream/test.py
@@ -18,45 +18,9 @@ moondream = transformers.AutoModelForCausalLM.from_pretrained(
     device_map={"": DEVICE},
 )
 
-diffusion_db_dataset = datasets.load_dataset("poloclub/diffusiondb", "2m_random_5k", trust_remote_code=True, split="train")\
-                               .shuffle()\
-                               .take(100)\
-                               .select_columns(["image"])\
-                               .map(lambda row: {
-                                   **row,
-                                   "qa": {
-                                       "question": "Describe this image.",
-                                       "answer": "This is an AI image."
-                                   }
-                               })
-
-flickr_dataset = datasets.load_dataset("nlphuji/flickr30k", split="test")\
-                         .shuffle()\
-                         .take(100)\
-                         .select_columns(["image"])\
-                         .map(lambda row: {
-                             **row,
-                             "qa": {
-                                 "question": "Describe this image.",
-                                 "answer": "This is a real image."
-                             }
-                         })
-
-midjourney_dataset = datasets.load_dataset("ehristoforu/midjourney-images", split="train", streaming=True)\
-                             .select_columns(["image"])\
-                             .map(lambda row: {
-                                 **row,
-                                 "qa": {
-                                     "question": "Describe this image.",
-                                     "answer": "This is an AI image."
-                                 }
-                             })
-
-dataset = datasets.concatenate_datasets([diffusion_db_dataset, flickr_dataset]).shuffle()
-
 pathlib.Path("./samples").mkdir(parents=True, exist_ok=True)
 
-img = Image.open("samples/frames_3.jpg")
+img = Image.open("samples/Untitled.jpg")
 md_answer = moondream.answer_question(
     moondream.encode_image(img),
     "Describe this image.",
@@ -68,28 +32,28 @@ md_answer = moondream.answer_question(
 
 print(md_answer)
 
-correct_predictions = 0
-for i, sample in enumerate(midjourney_dataset):
-    if i > 4:
-        break
+# correct_predictions = 0
+# for i, sample in enumerate(flickr_dataset):
+#     if i > 4:
+#         break
 
-    sample["image"].save(f"samples/{i}.png", "PNG")
+#     sample["image"].save(f"samples/{i}.png", "PNG")
 
-    md_answer = moondream.answer_question(
-        moondream.encode_image(sample['image']),
-        sample['qa']['question'],
-        tokenizer=tokenizer,
-        num_beams=4,
-        no_repeat_ngram_size=5,
-        early_stopping=True
-    )
+#     md_answer = moondream.answer_question(
+#         moondream.encode_image(sample['image']),
+#         sample['qa']['question'],
+#         tokenizer=tokenizer,
+#         num_beams=4,
+#         no_repeat_ngram_size=5,
+#         early_stopping=True
+#     )
 
-    print(f"Question: {sample['qa']['question']}")
-    print(f"Ground truth: {sample['qa']['answer']}")
-    print(f"Moondream: {md_answer}")
-    print()
+#     print(f"Question: {sample['qa']['question']}")
+#     print(f"Ground truth: {sample['qa']['answer']}")
+#     print(f"Moondream: {md_answer}")
+#     print()
 
-    if md_answer.lower() == sample['qa']['answer'].lower():
-        correct_predictions += 1
+#     if md_answer.lower() == sample['qa']['answer'].lower():
+#         correct_predictions += 1
 
-print(f"Accuracy: {correct_predictions * 100 / 10}%")
+# print(f"Accuracy: {correct_predictions * 100 / 10}%")
diff --git a/moondream/train.py b/moondream/train.py
index 7194cd4..e2289c9 100644
--- a/moondream/train.py
+++ b/moondream/train.py
@@ -4,58 +4,114 @@ import datasets
 import transformers
 import bitsandbytes
 import pathlib
+import io
+import PIL
+import utils.datasets
 from tqdm import tqdm
 from .hyperparams import TEST_SIZE, ANSWER_EOS, IMG_TOKENS, LR, BATCH_SIZE, EPOCHS, GRAD_ACCUM_STEPS
 
 DEVICE = "cuda"
 DTYPE = torch.float32 if DEVICE == "cpu" else torch.float16 # CPU doesn't support float16
 MD_REVISION = "2024-07-23"
+TOTAL_DATA_SIZE = 8000
 
-diffusion_db_dataset = datasets.load_dataset("poloclub/diffusiondb", "2m_random_5k", trust_remote_code=True, split="train")\
+diffusion_db_dataset = datasets.load_dataset("poloclub/diffusiondb", "2m_random_5k", split="train", trust_remote_code=True, streaming=True)\
                                .select_columns(["image"])\
                                .map(lambda row: {
                                    **row,
                                    "qa": {
-                                       "question": "Describe this image.",
-                                       "answer": "This is an AI image."
+                                       "question": "Is this image AI generated?",
+                                       "answer": "Yes."
                                    }
-                               })\
-                               .train_test_split(test_size=TEST_SIZE)
+                               })
+diffusion_db_dataset = utils.datasets.split_streaming_dataset(diffusion_db_dataset, total_size=2000, test_size=TEST_SIZE)
 
-flickr_dataset = datasets.load_dataset("nlphuji/flickr30k", split="test")\
-                         .take(2500)\
+midjourney_dataset = datasets.load_dataset("brivangl/midjourney-v6-llava", split="train", streaming=True)\
+                             .select_columns(["image"])\
+                             .map(lambda row: {
+                                 **row,
+                                 "qa": {
+                                     "question": "Is this image AI generated?",
+                                     "answer": "Yes."
+                                 }
+                             })
+midjourney_dataset = utils.datasets.split_streaming_dataset(midjourney_dataset, total_size=2000, test_size=TEST_SIZE)
+
+flickr_dataset = datasets.load_dataset("nlphuji/flickr30k", split="test", streaming=True)\
                          .select_columns(["image"])\
                          .map(lambda row: {
                              **row,
                              "qa": {
-                                 "question": "Describe this image.",
-                                 "answer": "This is a real image."
+                                 "question": "Is this image AI generated?",
+                                 "answer": "No."
                              }
-                         })\
-                         .train_test_split(test_size=TEST_SIZE)
+                         })
+flickr_dataset = utils.datasets.split_streaming_dataset(flickr_dataset, total_size=800, test_size=TEST_SIZE)
 
-wiki_art_dataset = datasets.load_dataset("huggan/wikiart", split="train")\
-                           .take(2500)\
+wiki_art_dataset = datasets.load_dataset("huggan/wikiart", split="train", streaming=True)\
                            .select_columns(["image"])\
                            .map(lambda row: {
                                **row,
                                "qa": {
-                                   "question": "Describe thie image.",
-                                   "answer": "This is a real image."
+                                   "question": "Is this image AI generated?",
+                                   "answer": "No."
                                }
-                           })\
-                           .train_test_split(test_size=TEST_SIZE)
+                           })
+wiki_art_dataset = utils.datasets.split_streaming_dataset(wiki_art_dataset, total_size=800, test_size=TEST_SIZE)
 
-training_dataset = datasets.concatenate_datasets([
+anime_dataset = datasets.load_dataset("animelover/danbooru2022", "1-full", trust_remote_code=True, split="train", streaming=True)\
+                        .select_columns(["image"])\
+                        .map(lambda row: {
+                            **row,
+                            "qa": {
+                                "question": "Is this image AI generated?",
+                                "answer": "No."
+                            }
+                        })
+anime_dataset = utils.datasets.split_streaming_dataset(anime_dataset, total_size=800, test_size=TEST_SIZE)
+
+coco_dataset = datasets.load_dataset("detection-datasets/coco", split="train", streaming=True)\
+                       .select_columns(["image"])\
+                       .map(lambda row: {
+                           **row,
+                           "qa": {
+                               "question": "Is this image AI generated?",
+                               "answer": "No."
+                           }
+                       })
+coco_dataset = utils.datasets.split_streaming_dataset(coco_dataset, total_size=800, test_size=TEST_SIZE)
+
+movie_poster_dataset = datasets.load_dataset("skvarre/movie_posters-100k", split="train", streaming=True)\
+                               .select_columns(["image"])\
+                               .map(lambda row: {
+                                   **row,  # keeps the "image" column that the later cast_column("image", ...) requires
+                                   "qa": {
+                                       "question": "Is this image AI generated?",
+                                       "answer": "No."  # this source is labelled as not AI generated
+                                   }
+                               })
+movie_poster_dataset = utils.datasets.split_streaming_dataset(movie_poster_dataset, total_size=800, test_size=TEST_SIZE)
+
+training_dataset = datasets.interleave_datasets([
     diffusion_db_dataset["train"],
+    midjourney_dataset["train"],
     flickr_dataset["train"],
     wiki_art_dataset["train"],
-]).shuffle()
-test_dataset = datasets.concatenate_datasets([
+    anime_dataset["train"],
+    coco_dataset["train"],
+    movie_poster_dataset["train"],
+], stopping_strategy="all_exhausted").cast_column("image", datasets.Image(decode=True))
+test_dataset = datasets.interleave_datasets([
     diffusion_db_dataset["test"],
+    midjourney_dataset["test"],
     flickr_dataset["test"],
     wiki_art_dataset["test"],
-]).shuffle()
+    anime_dataset["test"],
+    coco_dataset["test"],
+    movie_poster_dataset["test"],
+], stopping_strategy="all_exhausted").cast_column("image", datasets.Image(decode=True))
+
+print("Training and test dataset prepared.")
 
 tokenizer = transformers.AutoTokenizer.from_pretrained("vikhyatk/moondream2")
 moondream = transformers.AutoModelForCausalLM.from_pretrained(
@@ -150,7 +206,6 @@ dataloaders = {
     "train": torch.utils.data.DataLoader(
         training_dataset,
         batch_size=BATCH_SIZE,
-        shuffle=True,
         collate_fn=collate,
     ),
 }
@@ -158,7 +213,7 @@ dataloaders = {
 moondream.text_model.train()
 moondream.text_model.transformer.gradient_checkpointing_enable()
 
-total_steps = EPOCHS * len(dataloaders["train"]) // GRAD_ACCUM_STEPS
+total_steps = EPOCHS * -(-round(TOTAL_DATA_SIZE * (1 - TEST_SIZE)) // BATCH_SIZE) // GRAD_ACCUM_STEPS  # ceil-divide samples into batches, matching the len(dataloader) this replaces; NOTE(review): "all_exhausted" interleaving may yield more samples than TOTAL_DATA_SIZE - confirm
 optimizer = bitsandbytes.optim.Adam8bit(
     [{"params": moondream.text_model.parameters()}],
     lr=LR*0.1,
@@ -184,6 +239,7 @@ for epoch in range(EPOCHS):
 
 moondream.save_pretrained("checkpoints/moondream-mai")
 
+moondream.eval()
 pathlib.Path("./samples").mkdir(parents=True, exist_ok=True)
 
 correct_predictions = 0
@@ -201,9 +257,6 @@ for sample in tqdm(test_dataset, desc="Validation"):
     if md_answer == ground_truth:
         correct_predictions += 1
 
-    if i % 10 == 0:
-        print(f"Question: f{sample["qa"]["answer"]")
-
-accuracy = correct_predictions * 100 / len(test_dataset)
+accuracy = correct_predictions * 100 / (TOTAL_DATA_SIZE * TEST_SIZE)
 
 print(f"Model accuracy: f{accuracy}%")
diff --git a/utils/datasets.py b/utils/datasets.py
new file mode 100644
index 0000000..58c4197
--- /dev/null
+++ b/utils/datasets.py
@@ -0,0 +1,8 @@
+import datasets
+
+def split_streaming_dataset(ds: datasets.IterableDataset, total_size: int, test_size: float) -> dict[str, datasets.IterableDataset]:
+    """Split the first `total_size` samples of `ds` into a leading "train" stream and a trailing "test" stream."""
+    train_count = round(total_size * (1 - test_size))  # test_size is used as a fraction of total_size
+    train_part = ds.take(train_count)  # the first train_count samples
+    test_part = ds.skip(train_count).take(total_size - train_count)  # the samples right after the train part
+    return {"train": train_part, "test": test_part}