# !pip install torch numpy matplotlib

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

# Seed torch's global RNG with a fixed value, then choose the compute device:
# CUDA when torch reports it as available, otherwise the CPU.
torch.manual_seed(42)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('device:', device)

import urllib.request
# Download the TinyShakespeare corpus to the working directory and load it
# into memory as a single string named `text`.
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
urllib.request.urlretrieve(url, 'tinyshakespeare.txt')
# Read through a context manager so the file handle is closed deterministically,
# and pin the encoding so decoding does not depend on the platform's locale
# default (e.g. cp1252 on Windows).
with open('tinyshakespeare.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(f'corpus: {len(text)} chars, {len(set(text))} unique')

# Character-level tokenizer: the vocabulary is the sorted set of distinct
# characters in `text`; `itos` maps id -> char and `stoi` maps char -> id.
chars = sorted(set(text))
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}

# Encode the whole corpus as one 1-D tensor of integer ids.
data = torch.tensor(list(map(stoi.__getitem__, text)), dtype=torch.long)

# First 90% of the ids go to training, the remainder to validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
print('vocab:', len(chars), 'train:', len(train_data), 'val:', len(val_data))

# YOUR TURN
# class MultiHeadAttention(nn.Module): ...
# class TransformerBlock(nn.Module): ...
# class MiniTransformer(nn.Module): ...
# 4 layers, 4 heads, d_model=256, context length=128. ~3M parameters.

# YOUR TURN
# AdamW, lr=3e-4. Train for ~5000 iterations. Log train/val loss every 200 iter.

# YOUR TURN
# After training, generate 200 characters starting from the prompt 'ROMEO:'
# at each temperature in {0.5, 0.8, 1.0}. Compare diversity vs. coherence.

# Lab 1 — Mini transformer from scratch

# Setup

# TinyShakespeare

# Exercise 1 — Character-level tokenizer

# Exercise 2 — Implement the transformer block

# Exercise 3 — Training loop with eval

# Exercise 4 — Generate samples at three temperatures

# Done?