feat: add code for finetuning moondream

2024-12-08 17:33:37 +00:00
parent 4bf2c89bf8
commit 6b53cb0411
11 changed files with 261 additions and 0 deletions
--- a/resnet/resnet.py
+++ b/resnet/resnet.py
@@ -0,0 +1,193 @@
+import torch
+import torch.nn as nn
+
+
+# Side length of the square 3x3 window shared by the layers in this file:
+# "the convolutional layers mostly have 3×3 filters and follow two simple design rules: ..."
+# He et al., ‘Deep Residual Learning for Image Recognition’
+RESNET_KERNEL_SIZE = 3
+
+
+# used to match dimensions of input to output, done by a 1x1 convolution
+# He et al., ‘Deep Residual Learning for Image Recognition’ page 4
+def projection_shortcut(in_channels, out_channels, stride=2):
+    """Build a projection shortcut: a 1x1 convolution followed by batch norm.
+
+    The returned module reshapes the input of a residual block so that it can
+    be added to the block's output when the block changes the channel count
+    and/or the spatial resolution.
+
+    Args:
+        in_channels: Number of channels entering the residual block.
+        out_channels: Number of channels leaving the residual block.
+        stride: Stride of the 1x1 convolution. Defaults to 2 because
+            "when the shortcuts go across feature maps of two sizes, they are
+            performed with a stride of 2" (He et al.). Pass 1 for a projection
+            that only changes the number of channels.
+
+    Returns:
+        An ``nn.Sequential`` holding the ``nn.Conv2d`` (index 0) and the
+        ``nn.BatchNorm2d`` (index 1).
+    """
+    projection = nn.Conv2d(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        stride=stride,
+        kernel_size=1,
+    )
+    return nn.Sequential(projection, nn.BatchNorm2d(out_channels))
+
+
+class ResidualBlock(nn.Module):
+    """Two-layer residual block: conv-BN-ReLU, conv-BN, add shortcut, ReLU.
+
+    Args:
+        in_channels: Number of channels of the block input.
+        out_channels: Number of channels produced by both convolutions.
+        stride: Stride of the first 3x3 convolution; the second one always
+            uses stride 1.
+        shortcut: Optional module applied to the block input before it is
+            added to the output of the convolutions. When ``None`` the input
+            itself is added, so it must already be addable to that output.
+        *args, **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, stride=1, shortcut=None, *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+
+        self.conv0 = nn.Sequential(
+            nn.Conv2d(
+                in_channels, out_channels, kernel_size=3, stride=stride, padding=1
+            ),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
+        )
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(out_channels),
+        )
+        self.relu = nn.ReLU()
+        self.out_channels = out_channels
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        """Return ``relu(conv1(conv0(x)) + shortcut(x))``."""
+        out = self.conv1(self.conv0(x))
+        # Compare with None explicitly: the truth value of a module that
+        # defines __len__ (nn.Sequential, nn.ModuleList, ...) is False when it
+        # is empty, which would silently skip a shortcut that was supplied.
+        if self.shortcut is None:
+            identity = x
+        else:
+            identity = self.shortcut(x)
+        out += identity
+        return self.relu(out)
+
+
+# MAI in ResNet with 34 layers
+# He et al., ‘Deep Residual Learning for Image Recognition’.
+class MaiRes(nn.Module):
+    """34-layer residual network ending in a single-output linear layer.
+
+    Structure: a 7x7 stride-2 stem convolution, a 3x3 stride-2 max pool, four
+    stages of 3, 4, 6 and 3 two-layer residual blocks with 64, 128, 256 and
+    512 channels, an average pool and one fully connected layer.
+
+    The stages are named after the colors used for each group in the diagram
+    presented in the ResNet paper.
+
+    ``fc`` takes 2048 input features, i.e. a 512 x 2 x 2 map after
+    ``avgpool``. A 224x224 input produces exactly that (224 -> 112 -> 55 ->
+    55 -> 28 -> 14 -> 7 -> 2); other resolutions generally do not.
+
+    NOTE(review): unlike the paper there is no batch norm / ReLU after the
+    stem convolution and the pooling is not global -- confirm this is
+    intentional.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+        # first 7x7 conv layer; padding=3 is the "same"-style padding of a
+        # 7x7 kernel, so the kernel size must be 7 and not the 3x3 constant
+        self.conv = nn.Conv2d(
+            in_channels=3,
+            out_channels=64,
+            stride=2,
+            padding=3,
+            kernel_size=7,
+        )
+        self.maxpool = nn.MaxPool2d(kernel_size=RESNET_KERNEL_SIZE, stride=2)
+
+        # 3 residual blocks for a total of 6 layers
+        self.layer_purple = self._make_stage(64, 64, num_blocks=3, downsample=False)
+        # 4 residual blocks for a total of 8 layers
+        self.layer_green = self._make_stage(64, 128, num_blocks=4, downsample=True)
+        # 6 residual blocks for a total of 12 layers
+        self.layer_red = self._make_stage(128, 256, num_blocks=6, downsample=True)
+        # 3 residual blocks for a total of 6 layers
+        self.layer_blue = self._make_stage(256, 512, num_blocks=3, downsample=True)
+
+        self.avgpool = nn.AvgPool2d(kernel_size=RESNET_KERNEL_SIZE)
+        self.fc = nn.Linear(in_features=2048, out_features=1)
+
+    @staticmethod
+    def _make_stage(in_channels, out_channels, num_blocks, downsample):
+        """Stack ``num_blocks`` residual blocks into one ``nn.Sequential``.
+
+        Only the first block may change the shape: with ``downsample`` it uses
+        stride 2 together with a projection shortcut, otherwise stride 1 and
+        the plain identity. All remaining blocks keep ``out_channels`` and
+        stride 1.
+        """
+        if downsample:
+            first = ResidualBlock(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                stride=2,
+                shortcut=projection_shortcut(
+                    in_channels=in_channels, out_channels=out_channels
+                ),
+            )
+        else:
+            first = ResidualBlock(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                stride=1,
+            )
+        rest = [
+            ResidualBlock(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                stride=1,
+            )
+            for _ in range(num_blocks - 1)
+        ]
+        return nn.Sequential(first, *rest)
+
+    def forward(self, x):
+        """Run the stem, the four residual stages, pooling and the linear head."""
+        x = self.conv(x)
+        x = self.maxpool(x)
+
+        x = self.layer_purple(x)
+        x = self.layer_green(x)
+        x = self.layer_red(x)
+        x = self.layer_blue(x)
+
+        x = self.avgpool(x)
+        # flatten everything but the batch dimension; unlike view() this also
+        # works when the pooled tensor is not contiguous
+        x = torch.flatten(x, 1)
+        x = self.fc(x)
+
+        return x