Setting Up Video Classification on Ubuntu using PyTorch
This guide explains how to set up a video classification system on Ubuntu using PyTorch. We’ll use a simple model structure that applies a 3D Convolutional Neural Network (3D CNN) to classify video data.
1. Install System Prerequisites
Update your Ubuntu system and install necessary dependencies. Open a terminal and run the following commands:
# Refresh the package index
sudo apt update
# Upgrade the packages that are already installed
sudo apt upgrade
# Install Python 3, pip, and FFmpeg
sudo apt install python3 python3-pip ffmpeg
2. Install PyTorch and Additional Libraries
Install PyTorch and other libraries needed for video processing:
# Recent Ubuntu releases mark the system Python as "externally managed"
# (PEP 668), so a bare system-wide `pip install` is refused. Install the
# libraries into a virtual environment instead.
sudo apt install python3-venv
python3 -m venv venv
source venv/bin/activate
python3 -m pip install torch torchvision opencv-python scikit-learn
3. Prepare Video Dataset
Create a dataset directory structure with videos grouped by class. Example:
dataset/
├── class1/
│   ├── video1.mp4
│   └── video2.mp4
└── class2/
    ├── video3.mp4
    └── video4.mp4
4. Load and Process Video Data
The following code opens each video, reads its first frames (16 per clip by default), resizes every frame to 112×112, and stacks the frames into a tensor for the 3D CNN model.
import cv2
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
# Video dataset class
class VideoDataset(Dataset):
    """Dataset of fixed-length frame clips read from class-labelled folders.

    ``video_dir`` must contain one sub-directory per class, each holding the
    video files of that class. Every item is built from the first
    ``frames_per_clip`` frames of one video; shorter videos are padded by
    repeating their last frame.

    Args:
        video_dir: Root directory containing one folder per class.
        transform: Optional callable applied to every frame. Each frame is
            passed as an RGB ``H x W x 3`` uint8 array. When omitted, frames
            are converted to ``(C, H, W)`` uint8 tensors so they can still be
            stacked.
        frames_per_clip: Number of frames returned for every video.
        frame_size: ``(width, height)`` each frame is resized to.
    """

    def __init__(self, video_dir, transform=None, frames_per_clip=16,
                 frame_size=(112, 112)):
        self.video_dir = video_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.frame_size = frame_size
        # os.listdir returns entries in arbitrary order; sort so the
        # class -> index mapping is identical on every run, and skip stray
        # files (README, .DS_Store, ...) that are not class folders.
        self.classes = sorted(
            entry for entry in os.listdir(video_dir)
            if os.path.isdir(os.path.join(video_dir, entry))
        )
        self.label_map = {cls: idx for idx, cls in enumerate(self.classes)}
        self.video_paths = [
            (os.path.join(video_dir, cls, vid), cls)
            for cls in self.classes
            for vid in sorted(os.listdir(os.path.join(video_dir, cls)))
        ]

    def __len__(self):
        """Return the number of videos found under ``video_dir``."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Return ``(clip, label_index)`` for the video at position ``idx``.

        ``clip`` stacks the per-frame tensors along a new first dimension.
        """
        video_path, label = self.video_paths[idx]
        frames = self.load_frames(video_path)
        if self.transform is not None:
            frames = [self.transform(frame) for frame in frames]
        else:
            # torch.stack only accepts tensors; use the same channel-first
            # layout a ToTensor transform would produce.
            frames = [torch.from_numpy(frame).permute(2, 0, 1) for frame in frames]
        return torch.stack(frames), self.label_map[label]

    def load_frames(self, video_path):
        """Read, resize and pad the first ``frames_per_clip`` frames of a video.

        Returns:
            A list of exactly ``frames_per_clip`` RGB uint8 arrays.

        Raises:
            RuntimeError: If no frame could be read from ``video_path``.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while len(frames) < self.frames_per_clip:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.resize(frame, self.frame_size)
                # OpenCV decodes to BGR; downstream code expects RGB.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        finally:
            # Release the capture handle even if decoding raised.
            cap.release()
        if not frames:
            # Without this check the padding below fails with a bare IndexError.
            raise RuntimeError(f"Could not read any frames from video: {video_path}")
        # Pad short videos by repeating the last frame.
        frames += [frames[-1]] * (self.frames_per_clip - len(frames))
        return frames
# Per-frame preprocessing: convert to a tensor, then normalize with
# mean 0.5 and std 0.5.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(preprocessing_steps)

# Videos are read from the "dataset" directory and served in shuffled
# batches of four clips.
dataset = VideoDataset(video_dir="dataset", transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
5. Define the 3D CNN Model for Video Classification
Define a simple 3D CNN model for video classification. The 3D CNN will process each frame sequence as a 3D input.
import torch.nn as nn
class VideoClassificationModel(nn.Module):
    """Small 3D CNN that maps a video clip to class logits.

    Input is a float tensor of shape ``[B, 3, T, H, W]``. Two Conv3d/ReLU/
    MaxPool3d stages extract features, an adaptive average pool reduces them
    to a fixed ``4 x 4 x 4`` grid, and a two-layer MLP produces the logits.

    Args:
        num_classes: Number of output classes.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv3d = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3, 3, 3), stride=1, padding=1),
            nn.ReLU(),
            # Pool only spatially so the temporal length is kept.
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),
            nn.Conv3d(64, 128, kernel_size=(3, 3, 3), stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)),
        )
        # A 16 x 112 x 112 clip leaves the conv stack as 128 x 8 x 28 x 28,
        # which does not match the 128 * 4 * 4 * 4 features the classifier
        # expects. Pooling to a fixed 4 x 4 x 4 grid makes the sizes agree
        # for any clip length or resolution. The layer has no parameters, so
        # the conv3d/fc parameter names are unchanged.
        self.pool = nn.AdaptiveAvgPool3d((4, 4, 4))
        self.fc = nn.Sequential(
            nn.Linear(128 * 4 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return logits of shape ``[B, num_classes]`` for a batch of clips."""
        x = self.conv3d(x)
        x = self.pool(x)
        x = x.flatten(1)  # [B, 128 * 4 * 4 * 4]
        return self.fc(x)
# The model gets one output per class known to the dataset.
num_classes = len(dataset.classes)
model = VideoClassificationModel(num_classes=num_classes)
6. Training the Model
The following script trains the model on the loaded dataset using cross-entropy loss and the Adam optimizer.
import torch.optim as optim
# Cross-entropy loss, optimized with Adam at a learning rate of 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for a fixed number of epochs, reporting the mean batch loss of each
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for clips, targets in dataloader:
        # Swap the frame and channel axes: [B, T, C, H, W] -> [B, C, T, H, W]
        clips = clips.permute(0, 2, 1, 3, 4).float()

        optimizer.zero_grad()
        loss = criterion(model(clips), targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader):.4f}")
7. Testing the Model
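After training, you can test the model on individual videos or batches to evaluate its classification performance. Note that the sample loop below reuses the training dataloader, so the accuracy it reports is measured on the training data; for a realistic estimate, evaluate on a separate held-out set of videos.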
# Sample evaluation pass: count how many clips are classified correctly
model.eval()
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for inference
    for clips, targets in dataloader:
        # Swap the frame and channel axes: [B, T, C, H, W] -> [B, C, T, H, W]
        clips = clips.permute(0, 2, 1, 3, 4).float()
        logits = model(clips)
        predicted = logits.max(dim=1).indices
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
print(f"Accuracy of the model: {100 * correct / total}%")
8. Running the Script
Save the code into a file called video_classification.py and run it in the terminal:
# Run the saved script
python3 video_classification.py
The script will train the video classification model and output the accuracy.
9. Troubleshooting
If you encounter issues, check the following:
- Ensure all required libraries are installed.
- Check the dataset directory structure and video file formats.
- Verify that videos are read and resized correctly in the VideoDataset class.
10. Conclusion
You have successfully set up a video classification system on Ubuntu using PyTorch. This system can be further enhanced by using more complex models and larger datasets for improved accuracy.