Setting Up Video Classification on Ubuntu using PyTorch

This guide explains how to set up a video classification system on Ubuntu using PyTorch. We’ll use a simple model structure that applies a 3D Convolutional Neural Network (3D CNN) to classify video data.

1. Install System Prerequisites

Update your Ubuntu system and install necessary dependencies. Open a terminal and run the following commands:

sudo apt update
sudo apt upgrade
sudo apt install python3 python3-pip ffmpeg
    

2. Install PyTorch and Additional Libraries

Install PyTorch and other libraries needed for video processing:

pip install torch torchvision opencv-python scikit-learn
    

3. Prepare Video Dataset

Create a dataset directory structure with videos grouped by class. Example:

dataset/
├── class1/
│   ├── video1.mp4
│   └── video2.mp4
└── class2/
    ├── video3.mp4
    └── video4.mp4
    

4. Load and Process Video Data

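The following code reads up to the first 16 frames of each video, resizes every frame to 112×112 pixels, pads videos that are too short by repeating their last frame, and prepares the resulting clips for the 3D CNN model.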

import cv2
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

# Video dataset class
class VideoDataset(Dataset):
    """Dataset of fixed-length frame clips read from class-labelled video folders.

    ``video_dir`` must contain one sub-directory per class, each holding the
    video files of that class. Every item is a clip built from the first
    ``frames_per_clip`` frames of one video (each frame resized to 112x112)
    together with the integer label of the video's class.

    Args:
        video_dir: Root directory with one sub-directory per class.
        transform: Optional callable applied to every frame (the H x W x C
            array produced by OpenCV). It must return a tensor so the frames
            can be stacked. When omitted, frames are converted to tensors in
            C x H x W layout without any scaling.
        frames_per_clip: Number of frames returned for every video.
    """

    def __init__(self, video_dir, transform=None, frames_per_clip=16):
        self.video_dir = video_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        # os.listdir returns entries in arbitrary order, so sort to keep the
        # class -> index mapping reproducible across runs and machines. Stray
        # files in the root (e.g. .DS_Store, README) are not classes.
        self.classes = sorted(
            entry
            for entry in os.listdir(video_dir)
            if os.path.isdir(os.path.join(video_dir, entry))
        )
        self.video_paths = [
            (os.path.join(video_dir, cls, vid), cls)
            for cls in self.classes
            for vid in sorted(os.listdir(os.path.join(video_dir, cls)))
        ]
        self.label_map = {cls: idx for idx, cls in enumerate(self.classes)}

    def __len__(self):
        """Return the number of videos found under ``video_dir``."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Return ``(clip, label)`` for the video at position ``idx``.

        ``clip`` is the stack of the video's frames along a new first
        dimension; ``label`` is the integer index of the video's class.
        """
        video_path, label = self.video_paths[idx]
        frames = self.load_frames(video_path)
        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        else:
            # torch.stack only accepts tensors; use the same channel-first
            # layout that a ToTensor-style transform would produce.
            frames = [torch.from_numpy(frame).permute(2, 0, 1) for frame in frames]
        return torch.stack(frames), self.label_map[label]

    def load_frames(self, video_path):
        """Read exactly ``frames_per_clip`` frames from ``video_path``.

        Frames are taken from the start of the video and resized to 112x112.
        If the video is shorter than ``frames_per_clip``, its last frame is
        repeated to fill the clip.

        Raises:
            RuntimeError: If no frame could be read from the video.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while len(frames) < self.frames_per_clip:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(cv2.resize(frame, (112, 112)))
        finally:
            # Release the capture even if resizing fails part-way through.
            cap.release()
        if not frames:
            # Without this check the padding below would fail with an opaque
            # IndexError on frames[-1].
            raise RuntimeError(f"Could not read any frames from video: {video_path}")
        # Pad if not enough frames
        frames += [frames[-1]] * (self.frames_per_clip - len(frames))
        return frames

# Per-frame preprocessing: convert each frame to a tensor, then normalize it
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Clips are read from ./dataset and served in shuffled batches of four.
dataset = VideoDataset(video_dir="dataset", transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
    

5. Define the 3D CNN Model for Video Classification

Define a simple 3D CNN model for video classification. A 3D CNN convolves over time as well as height and width, so each clip is passed to it as a single tensor of shape [batch, channels, frames, height, width].

import torch.nn as nn

class VideoClassificationModel(nn.Module):
    """Small 3D CNN that maps a batch of video clips to class logits.

    Input is a tensor of shape [B, 3, T, H, W]; output is [B, num_classes].

    Args:
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv3d = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3, 3, 3), stride=1, padding=1),
            nn.ReLU(),
            # Halve height and width only; keep the full temporal length.
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)),
            nn.Conv3d(64, 128, kernel_size=(3, 3, 3), stride=1, padding=1),
            nn.ReLU(),
            # Halve time, height and width.
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
        )
        # The classifier below is sized for a 128 x 4 x 4 x 4 feature map, but
        # the conv stack only divides T by 2 and H, W by 4 (a 16 x 112 x 112
        # clip yields 128 x 8 x 28 x 28). Pool to a fixed 4 x 4 x 4 grid so the
        # flattened size always matches; for feature maps that are already
        # 4 x 4 x 4 this pooling is an identity.
        self.pool = nn.AdaptiveAvgPool3d((4, 4, 4))
        self.fc = nn.Sequential(
            nn.Linear(128 * 4 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape [B, num_classes] for clips ``x``."""
        x = self.conv3d(x)
        x = self.pool(x)
        x = x.flatten(1)  # [B, 128, 4, 4, 4] -> [B, 8192]
        return self.fc(x)

# One output logit per class known to the dataset.
num_classes = len(dataset.classes)
model = VideoClassificationModel(num_classes=num_classes)
    

6. Training the Model

The following script trains the model on the loaded dataset using cross-entropy loss and the Adam optimizer.

import torch.optim as optim

# Loss and optimizer: cross-entropy on the class logits, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: one pass over the dataloader per epoch, reporting the mean
# batch loss at the end of each epoch.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in dataloader:
        # Swap the frame and channel axes: [B, T, C, H, W] -> [B, C, T, H, W].
        inputs = inputs.transpose(1, 2).float()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader):.4f}")
    

7. Testing the Model

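After training, you can test the model on individual videos or batches to evaluate its classification performance. Note that the sample loop below reuses the training dataloader, so the accuracy it prints is training accuracy; for a meaningful evaluation, build a second VideoDataset and DataLoader over held-out videos and iterate over that instead.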

# Sample test loop: count how many clips the model labels correctly.
model.eval()
correct = 0
total = 0
with torch.no_grad():  # inference only, so skip gradient tracking
    for inputs, labels in dataloader:
        # Swap the frame and channel axes: [B, T, C, H, W] -> [B, C, T, H, W].
        inputs = inputs.transpose(1, 2).float()
        outputs = model(inputs)
        # The predicted class is the index of the largest logit in each row.
        predicted = torch.max(outputs, dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy of the model: {100 * correct / total}%")
    

8. Running the Script

Save the Python code from steps 4–7, in order, into a single file called video_classification.py and run it in the terminal:

python3 video_classification.py
    

The script will train the video classification model and output the accuracy.

9. Troubleshooting

If you encounter issues, check the following:

  • Ensure all required libraries are installed.
  • Check the dataset directory structure and video file formats.
  • Verify that videos are read and resized correctly in the VideoDataset class.

10. Conclusion

You have successfully set up a video classification system on Ubuntu using PyTorch. This system can be further enhanced by using more complex models and larger datasets for improved accuracy.