GPU vs. TPU vs. CPU for AI Workloads

Implementation Example

# Hardware Selection for AI Workloads

class HardwareSelector:
    """Rule-based picker of an accelerator for an AI workload.

    The instance exposes ``hardware_specs``, a dict keyed by hardware name
    ('A100', 'H100', 'TPU-v4', 'CPU') holding descriptive attributes.
    ``select_hardware`` applies a fixed chain of rules to a workload
    description, and ``estimate_cost`` multiplies a simplified hourly price
    by a number of hours.

    Note: ``select_hardware`` may return 'custom_asic', which has no entry
    in ``hardware_specs`` or in the hourly price table; ``estimate_cost``
    returns 0 for it (and for any other unknown name).
    """

    # Simplified hourly prices in dollars, keyed by hardware name.
    # Kept at class level so the table is not rebuilt on every call.
    _HOURLY_COST = {
        'A100': 3.0,    # $3.0/hour (simplified)
        'H100': 9.0,    # $9.0/hour
        'TPU-v4': 2.0,  # $2.0/hour
        'CPU': 0.1,     # $0.1/hour
    }

    # Model-size labels recognised by the selection rules.
    _MID_SIZE_MODELS = ('7B', '13B', '33B')
    _LARGE_MODELS = ('70B', '175B')
    _SMALL_MODELS = ('BERT-base', 'distilled')

    def __init__(self):
        """Build the per-instance table of hardware characteristics."""
        # NOTE(review): 'flops_fp16' values look like peak TFLOPS figures;
        # the unit is not stated here -- confirm against vendor datasheets.
        self.hardware_specs = {
            'A100': {
                'flops_fp16': 312,
                'memory_bandwidth': '2 TB/s',
                'memory': '80GB',
                'cost_per_flop': 'high',
                'programmability': 'high',
                'framework_support': 'excellent'
            },
            'H100': {
                'flops_fp16': 989,
                'memory_bandwidth': '3.35 TB/s',
                'memory': '80GB',
                'cost_per_flop': 'very_high',
                'programmability': 'high',
                'framework_support': 'excellent'
            },
            'TPU-v4': {
                'flops_fp16': 275,
                'memory_bandwidth': '1.2 TB/s',
                'memory': '128GB',
                'cost_per_flop': 'medium',
                'programmability': 'medium',
                'framework_support': 'good'
            },
            'CPU': {
                'flops_fp16': 1,
                'memory_bandwidth': '50 GB/s',
                'memory': 'variable',
                'cost_per_flop': 'low',
                'programmability': 'very_high',
                'framework_support': 'excellent'
            }
        }

    def select_hardware(self, workload):
        """Return the name of the hardware recommended for *workload*.

        Args:
            workload: Mapping that may contain the keys 'model_size',
                'framework', 'scale' and 'latency_sensitivity'. Missing
                keys default to 'unknown', 'pytorch', 'small' and 'medium'
                respectively. ``None`` is treated as an empty mapping.

        Returns:
            One of 'TPU-v4', 'A100', 'H100', 'CPU' or 'custom_asic'.

        The rules are evaluated in order and the first match wins:
            1. JAX at large scale                      -> 'TPU-v4'
            2. 7B / 13B / 33B models                   -> 'A100'
            3. 70B / 175B models, high latency needs   -> 'H100'
            4. BERT-base / distilled models            -> 'CPU'
            5. hyperscale with low latency sensitivity -> 'custom_asic'
            6. anything else                           -> 'A100'
        """
        # A missing workload description carries no constraints, so it
        # falls through to the default instead of raising AttributeError.
        if workload is None:
            workload = {}

        model_size = workload.get('model_size', 'unknown')
        framework = workload.get('framework', 'pytorch')
        scale = workload.get('scale', 'small')
        latency_sensitivity = workload.get('latency_sensitivity', 'medium')

        # Rule order matters: the JAX rule deliberately precedes the
        # model-size rules, so a large JAX job always maps to the TPU.
        if framework == 'jax' and scale == 'large':
            return 'TPU-v4'
        if model_size in self._MID_SIZE_MODELS:
            return 'A100'
        if model_size in self._LARGE_MODELS and latency_sensitivity == 'high':
            return 'H100'
        if model_size in self._SMALL_MODELS:
            return 'CPU'
        if scale == 'hyperscale' and latency_sensitivity == 'low':
            return 'custom_asic'
        return 'A100'  # Default choice

    def estimate_cost(self, hardware, hours):
        """Return the simplified cost of running *hardware* for *hours*.

        Args:
            hardware: Hardware name, e.g. a value returned by
                ``select_hardware``.
            hours: Number of hours to price.

        Returns:
            Hourly price times *hours*. Names without a price entry
            (including 'custom_asic') are priced at 0.
        """
        return self._HOURLY_COST.get(hardware, 0) * hours

# Usage example: run three representative workloads through the selector.
# Each call rebinds `hardware`, so only the last result remains bound.
selector = HardwareSelector()

# Scenario 1: a 13B LLM trained with PyTorch at medium scale.
llm_workload = dict(
    model_size='13B',
    framework='pytorch',
    scale='medium',
    latency_sensitivity='low',
)
hardware = selector.select_hardware(llm_workload)  # Returns 'A100'

# Scenario 2: large-scale training with JAX.
jax_workload = dict(
    model_size='175B',
    framework='jax',
    scale='large',
    latency_sensitivity='medium',
)
hardware = selector.select_hardware(jax_workload)  # Returns 'TPU-v4'

# Scenario 3: inference with a small model.
small_model_workload = dict(
    model_size='BERT-base',
    framework='pytorch',
    scale='small',
    latency_sensitivity='low',
)
hardware = selector.select_hardware(small_model_workload)  # Returns 'CPU'

GPU vs. TPU vs. CPU for AI Workloads

Intent & Description

🎯 Intent

📋 Context

💡 Solution

Real-world Use Case

📌 TL;DR

Advantages

Disadvantages