JAX vs. PyTorch

Implementation Example

// JAX vs. PyTorch: Same model, different approaches

// PyTorch: Imperative, Pythonic approach
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 10)
        )
    
    def forward(self, x):
        # Easy to add complex logic, print statements, debugging
        if self.training:
            print(f"Training mode, input shape: {x.shape}")
        return self.layers(x)

model = MLP()
optimizer = torch.optim.Adam(model.parameters())

# Training loop (imperative, easy to debug)
for epoch in range(10):
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = nn.CrossEntropyLoss()(output, batch_y)
        loss.backward()
        optimizer.step()

// JAX: Functional, compilable approach
import jax
import jax.numpy as jnp
from flax import linen as nn

class MLP(nn.Module):
    @nn.compact
    def __call__(self, x):
        x = nn.Dense(256)(x)
        x = nn.relu(x)
        x = nn.Dense(128)(x)
        x = nn.relu(x)
        return nn.Dense(10)(x)

model = MLP()
params = model.init(jax.random.PRNGKey(0), jnp.ones((1, 784)))

# Compiled, optimized training step
@jax.jit
def train_step(params, x, y):
    def loss_fn(params):
        logits = model.apply(params, x)
        loss = optax.softmax_cross_entropy_with_integer_labels(logits, y)
        return loss.mean()
    
    loss, grads = jax.value_and_grad(loss_fn)(params)
    return loss, grads

# Much faster, but requires functional discipline
for epoch in range(10):
    for batch_x, batch_y in dataloader:
        loss, grads = train_step(params, batch_x, batch_y)
        params = optax.apply_gradients(grads)

Intent & Description

🎯 Intent

📋 Context

💡 Solution

Real-world Use Case

📌 TL;DR

Advantages

Disadvantages