ailearn

深度学习训练技巧 - 优化与正则化

掌握深度学习训练核心技巧,提升模型训练效果

访问-- -- --

前置知识:需要先掌握 神经网络基础

本文重点:训练技巧、优化算法、正则化方法


一、优化算法

1.1 梯度下降变体

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
"""
梯度下降算法演进:
1. SGD (Stochastic Gradient Descent)
   θ = θ - lr * ∇θ
   
2. Momentum
   v = β * v + ∇θ
   θ = θ - lr * v
   积累历史梯度,加速收敛
3. NAG (Nesterov Accelerated Gradient)
   先"预测"再计算梯度
   
4. AdaGrad
   自适应学习率,适合稀疏数据
   
5. RMSprop
   解决AdaGrad学习率衰减问题
   
6. Adam
   结合Momentum和RMSprop
   最常用的优化器
"""
# Build a small example model: 10 inputs -> 50 hidden units (ReLU) -> 1 output.
model = nn.Sequential(
    nn.Linear(10, 50),
    nn.ReLU(),
    nn.Linear(50, 1)
)
# Side-by-side construction of several optimizers for comparison.
# NOTE(review): every entry is bound to the same model.parameters(); this reads
# as a demonstration of constructor arguments only -- confirm before stepping
# more than one of them in a real training loop.
optimizers = {
    'SGD': optim.SGD(model.parameters(), lr=0.01),
    'SGD+Momentum': optim.SGD(model.parameters(), lr=0.01, momentum=0.9),
    'Adam': optim.Adam(model.parameters(), lr=0.001),
    'AdamW': optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01),
}
# Adam with its main hyperparameters spelled out explicitly.
adam = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8)
"""
Adam参数说明:
- lr: 学习率,默认0.001
- betas: (β1, β2) 动量参数,默认(0.9, 0.999)
- eps: 数值稳定性,默认1e-8
- weight_decay: L2正则化系数
工作原理:
m_t = β1 * m_{t-1} + (1-β1) * g_t     # 一阶矩估计
v_t = β2 * v_{t-1} + (1-β2) * g_t²    # 二阶矩估计
m̂_t = m_t / (1-β1^t)                   # 偏差修正
v̂_t = v_t / (1-β2^t)
θ_t = θ_{t-1} - lr * m̂_t / (√v̂_t + ε)
"""

1.2 学习率调度

"""
学习率调度策略:
1. StepLR: 每隔N个epoch降低学习率
2. ExponentialLR: 指数衰减
3. CosineAnnealingLR: 余弦退火
4. ReduceLROnPlateau: 指标停止改善时降低
5. OneCycleLR: 超收敛策略
6. Warmup: 预热学习率
"""
import torch.optim.lr_scheduler as scheduler
# Single-layer model and a plain SGD optimizer (initial lr=0.1) shared by the
# scheduler examples below.
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
# 1. Step decay
step_scheduler = scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
# Every 30 epochs: lr = lr * 0.1
# 2. Cosine annealing (recommended)
cosine_scheduler = scheduler.CosineAnnealingLR(
    optimizer, 
    T_max=100,      # annealing period
    eta_min=1e-6    # minimum learning rate
)
# 3. Adaptive adjustment
# NOTE(review): `verbose` is deprecated in recent PyTorch releases -- confirm
# against the installed version before relying on it.
plateau_scheduler = scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',       # the monitored metric is better when smaller
    factor=0.1,       # decay factor
    patience=10,      # wait 10 epochs
    verbose=True
)
# 4. 预热 + 余弦退火(大模型训练常用)
class WarmupCosineScheduler:
    """Learning-rate schedule: linear warmup followed by cosine annealing.

    During the first ``warmup_epochs`` calls to :meth:`step` the learning rate
    rises linearly from ``warmup_lr`` towards the optimizer's initial learning
    rate. Afterwards it follows half a cosine wave from that initial rate
    down to ``min_lr``, reaching ``min_lr`` at ``total_epochs`` and staying
    there for any later call.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts with an
            ``'lr'`` entry), e.g. a ``torch.optim`` optimizer.
        warmup_epochs: Number of warmup steps.
        total_epochs: Total number of scheduled steps (warmup included).
        warmup_lr: Learning rate at the very first step.
        min_lr: Floor of the cosine decay.
    """

    def __init__(self, optimizer, warmup_epochs, total_epochs, warmup_lr=1e-6, min_lr=1e-6):
        self.optimizer = optimizer
        self.warmup_epochs = warmup_epochs
        self.total_epochs = total_epochs
        self.warmup_lr = warmup_lr
        self.min_lr = min_lr
        # The peak rate is read from the first parameter group only; step()
        # then writes the same scheduled value into every group.
        self.base_lr = optimizer.param_groups[0]['lr']
        self.current_epoch = 0

    def step(self):
        """Advance one epoch, write the new rate to the optimizer, return it.

        Returns:
            The learning rate (a Python ``float``) applied for this step.
        """
        if self.current_epoch < self.warmup_epochs:
            # Linear warmup
            lr = self.warmup_lr + (self.base_lr - self.warmup_lr) * \
                 self.current_epoch / self.warmup_epochs
        else:
            # Cosine annealing
            decay_epochs = self.total_epochs - self.warmup_epochs
            if decay_epochs > 0:
                # Clamp so the cosine never swings back up once the schedule
                # has run past total_epochs.
                progress = min(
                    (self.current_epoch - self.warmup_epochs) / decay_epochs, 1.0
                )
            else:
                # No decay phase configured: avoid dividing by zero and treat
                # the schedule as finished.
                progress = 1.0
            lr = self.min_lr + 0.5 * (self.base_lr - self.min_lr) * \
                 (1 + np.cos(np.pi * progress))

        # Hand plain floats to the optimizer instead of numpy scalars.
        lr = float(lr)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

        self.current_epoch += 1
        return lr
# 可视化学习率变化
def plot_learning_rate_schedule():
    """Plot a 100-epoch warmup + cosine schedule and save it as lr_schedule.png.

    A throwaway SGD optimizer (lr=0.1) over one random tensor drives a
    WarmupCosineScheduler with 10 warmup epochs; the rate returned by each
    step is collected, plotted, written to disk and shown.
    """
    total_epochs, warmup_len = 100, 10

    dummy_param = torch.randn(2, 1, requires_grad=True)
    sgd = optim.SGD([dummy_param], lr=0.1)
    lr_schedule = WarmupCosineScheduler(sgd, warmup_len, total_epochs)

    # One scheduler step per epoch; keep every rate for plotting.
    history = [lr_schedule.step() for _ in range(total_epochs)]

    plt.figure(figsize=(10, 4))
    plt.plot(history)
    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')
    plt.title('Warmup + Cosine Annealing Learning Rate Schedule')
    # Dashed vertical line marks where warmup hands over to the cosine phase.
    plt.axvline(x=warmup_len, color='r', linestyle='--', label='Warmup End')
    plt.legend()
    plt.grid(True)
    plt.savefig('lr_schedule.png', dpi=100)
    plt.show()

二、正则化技术

2.1 L1/L2正则化

"""
L1正则化 (Lasso)
Loss = Loss_original + λ * Σ|w_i|
- 产生稀疏权重
- 特征选择效果
L2正则化 (Ridge/Weight Decay)
Loss = Loss_original + λ * Σw_i²
- 防止权重过大
- 常用且稳定
"""
# Implementation in PyTorch
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)  # L2 penalty via weight_decay
# 手动实现L1正则化
def l1_regularization(model, lambda_l1):
    """Return ``lambda_l1`` times the summed absolute value of all parameters.

    Every tensor yielded by ``model.parameters()`` contributes, so biases are
    penalized together with weights. With no parameters the result is
    ``lambda_l1 * 0``.
    """
    abs_total = sum(param.abs().sum() for param in model.parameters())
    return lambda_l1 * abs_total
# 训练循环
def train_with_regularization(model, dataloader, optimizer, lambda_l1=0.01, lambda_l2=0.01):
    """Run one pass over ``dataloader`` with an explicit L1 penalty on the loss.

    The data loss comes from the module-level ``criterion``; the L1 term is
    added through ``l1_regularization``. ``lambda_l2`` is accepted but not
    used here: the L2 penalty is expected to be configured on the optimizer
    through its ``weight_decay`` argument.
    """
    model.train()
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()

        loss = criterion(model(batch_inputs), batch_targets)

        # Add the L1 penalty (L2 is handled by the optimizer's weight_decay).
        l1_penalty = l1_regularization(model, lambda_l1)
        loss += l1_penalty

        loss.backward()
        optimizer.step()

2.2 Dropout

class DropoutModel(nn.Module):
    """Three-layer fully connected network with dropout after each hidden layer.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Size of the output vector.
        dropout_prob: Drop probability used by both dropout layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_prob=0.5):
        super().__init__()

        # First hidden stage
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout1 = nn.Dropout(dropout_prob)
        # Second hidden stage
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout2 = nn.Dropout(dropout_prob)
        # Output projection (no activation, no dropout)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply linear -> ReLU -> dropout twice, then the output layer."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)
"""
Dropout要点:
1. 训练时随机丢弃神经元,测试时使用全部
2. 常用dropout概率:0.2-0.5
3. CNN中常用较低dropout或不用
4. 在全连接层之间使用
Dropout变体:
- SpatialDropout: 整个通道丢弃(CNN)
- DropConnect: 丢弃连接而非神经元
- DropBlock: 丢弃连续区域(CNN)
"""
class SpatialDropout(nn.Module):
    """Channel-wise dropout for convolutional feature maps.

    In training mode a whole channel of a sample is either kept (and rescaled
    by ``1 / (1 - drop_prob)``) or zeroed. In evaluation mode, or when
    ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob: Probability of dropping a channel, in ``[0, 1]``.

    Raises:
        ValueError: If ``drop_prob`` is outside ``[0, 1]``, or if the input
            to ``forward`` has fewer than two dimensions.
    """

    def __init__(self, drop_prob=0.2):
        super().__init__()
        if not 0 <= drop_prob <= 1:
            raise ValueError(f"drop_prob must be in [0, 1], got {drop_prob}")
        self.drop_prob = drop_prob

    def forward(self, x):
        """Drop entire channels of ``x`` (first dim = batch, second = channel)."""
        if not self.training or self.drop_prob == 0:
            return x
        if x.dim() < 2:
            raise ValueError("SpatialDropout expects input with batch and channel dimensions")
        if self.drop_prob >= 1:
            # Everything is dropped; rescaling would be 0 / 0 and produce NaN.
            return torch.zeros_like(x)

        keep_prob = 1 - self.drop_prob
        # One Bernoulli draw per (sample, channel); the trailing singleton
        # dims broadcast over however many spatial dimensions x has.
        mask_shape = tuple(x.shape[:2]) + (1,) * (x.dim() - 2)
        mask = torch.bernoulli(
            torch.full(mask_shape, keep_prob, device=x.device)
        ).to(x.dtype)
        return x * mask / keep_prob

2.3 Batch Normalization

class BNModel(nn.Module):
    """Three-layer fully connected network with BatchNorm on both hidden layers.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Size of the output vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Hidden stage 1: linear followed by batch normalization
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        # Hidden stage 2: same layout
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        # Output projection
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply linear -> batch norm -> ReLU twice, then the output layer."""
        hidden = torch.relu(self.bn1(self.fc1(x)))
        hidden = torch.relu(self.bn2(self.fc2(hidden)))
        return self.fc3(hidden)
"""
Batch Normalization:
- 计算每个mini-batch的均值和方差
- 标准化后再缩放和平移
- 允许使用更大学习率
- 减少对初始化的依赖
BN vs LN vs IN vs GN:
┌─────────────────────────────────────┐
│ Batch Norm:    [N, C, H, W]         │  沿N,H,W归一化(每个通道单独统计)
│ Layer Norm:    [N, C, H, W]         │  沿C,H,W归一化(每个样本单独统计)
│ Instance Norm: [N, C, H, W]         │  沿H,W归一化(每个样本的每个通道单独统计)
│ Group Norm:    [N, G, C//G, H, W]   │  分组归一化
└─────────────────────────────────────┘
选择建议:
- CNN: Batch Norm
- RNN/Transformer: Layer Norm
- 风格迁移: Instance Norm
- 小Batch训练: Group Norm
"""
# LayerNorm(Transformer常用)
class TransformerLayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    ``eps`` is added to the standard deviation itself (not to the variance
    under the square root).

    Args:
        hidden_dim: Size of the last (normalized) dimension.
        eps: Small constant added to the standard deviation.
    """

    def __init__(self, hidden_dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_dim))
        self.bias = nn.Parameter(torch.zeros(hidden_dim))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last axis, then scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        # Population (biased) standard deviation of the last axis.
        denom = x.std(-1, keepdim=True, unbiased=False) + self.eps
        return self.weight * centered / denom + self.bias

2.4 数据增强

import torchvision.transforms as transforms
# Image data augmentation
train_transform = transforms.Compose([
    # Geometric transforms
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    
    # Color transforms
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1
    ),
    
    # Advanced augmentation
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    
    # Tensor conversion and normalization
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
# Using Albumentations (more powerful)
# pip install albumentations
try:
    import albumentations as A
    from albumentations.pytorch import ToTensorV2
    
    albumentations_transform = A.Compose([
        A.RandomCrop(224, 224),
        A.HorizontalFlip(p=0.5),
        A.Rotate(limit=15),
        # Apply one of the two noise/blur transforms, 30% of the time
        A.OneOf([
            A.GaussNoise(),
            A.GaussianBlur(),
        ], p=0.3),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])
except ImportError:
    # albumentations is optional: when it is not installed,
    # albumentations_transform is simply left undefined.
    pass
# Mixup增强
def mixup_data(x, y, alpha=0.2):
    """Mixup augmentation: blend each sample with a randomly chosen partner.

    Args:
        x: Batch of inputs; the first dimension is the batch dimension.
        y: Batch of targets, indexable along its first dimension.
        alpha: Parameter of the symmetric Beta(alpha, alpha) distribution the
            mixing coefficient is drawn from. A value ``<= 0`` disables
            mixing (the coefficient is fixed to 1).

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) *
        x[perm]``, ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and ``lam`` is a
        Python float in ``[0, 1]``.
    """
    # np.random.beta rejects non-positive parameters, so treat alpha <= 0 as
    # "no mixup" instead of raising. float() keeps numpy scalars out of the
    # tensor arithmetic and out of the return value.
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0
    batch_size = x.size(0)
    index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]

    return mixed_x, y_a, y_b, lam
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Mixup loss: the ``lam``-weighted blend of the losses against both targets."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b
# CutMix增强
def cutmix_data(x, y, beta=1.0):
    """CutMix augmentation: paste a random box from a partner sample.

    Args:
        x: Batch of inputs indexed as ``[batch, channel, dim2, dim3]``.
        y: Batch of targets, indexable along its first dimension.
        beta: Parameter of the symmetric Beta(beta, beta) distribution the
            initial mixing ratio is drawn from. A value ``<= 0`` disables
            cutting (ratio fixed to 1).

    Returns:
        ``(mixed_x, y_a, y_b, lam)``. ``mixed_x`` is a new tensor (the input
        batch is left untouched), ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and
        ``lam`` is the fraction of each sample that was NOT replaced, as a
        Python float.
    """
    lam = float(np.random.beta(beta, beta)) if beta > 0 else 1.0
    batch_size = x.size(0)
    index = torch.randperm(batch_size)

    # Box side lengths are chosen so the box covers (1 - lam) of the area.
    size_a, size_b = x.size(2), x.size(3)
    cut_rat = np.sqrt(1.0 - lam)
    cut_a = int(size_a * cut_rat)
    cut_b = int(size_b * cut_rat)

    # Random box centre, drawn for dim 2 first and then dim 3.
    centre_a = np.random.randint(size_a)
    centre_b = np.random.randint(size_b)

    a1 = int(np.clip(centre_a - cut_a // 2, 0, size_a))
    b1 = int(np.clip(centre_b - cut_b // 2, 0, size_b))
    a2 = int(np.clip(centre_a + cut_a // 2, 0, size_a))
    b2 = int(np.clip(centre_b + cut_b // 2, 0, size_b))

    # Work on a copy so the caller's batch is not modified in place.
    mixed_x = x.clone()
    mixed_x[:, :, a1:a2, b1:b2] = x[index, :, a1:a2, b1:b2]

    # The box may have been clipped at the border (and int() truncates), so
    # recompute lam from the area that was actually replaced; otherwise the
    # loss weights would not match the pixels.
    lam = 1.0 - ((a2 - a1) * (b2 - b1)) / (size_a * size_b)

    return mixed_x, y, y[index], lam

三、梯度问题解决

3.1 梯度消失与爆炸

"""
梯度消失原因:
- Sigmoid导数最大仅为0.25,Tanh导数最大为1且在饱和区趋近于0
- 多层链式相乘导致梯度指数衰减
解决方案:
1. 使用ReLU及其变体
2. 残差连接
3. BatchNorm
4. 合理的权重初始化
"""
# Activation function comparison
activations = {
    'Sigmoid': nn.Sigmoid(),      # output in (0, 1); prone to severe vanishing gradients
    'Tanh': nn.Tanh(),            # output in (-1, 1); vanishing gradients are milder
    'ReLU': nn.ReLU(),            # recommended default; gradient is 1 for positive inputs
    'LeakyReLU': nn.LeakyReLU(0.01),  # addresses "dying" ReLU units
    'ELU': nn.ELU(),              # smoother ReLU variant
    'GELU': nn.GELU(),            # common in Transformers
    'Swish': nn.SiLU(),           # smooth, self-gated
}
# 梯度裁剪(解决梯度爆炸)
def train_with_gradient_clipping(model, dataloader, optimizer, max_norm=1.0):
    """Run one pass over ``dataloader``, clipping the gradient norm each step.

    The loss comes from the module-level ``criterion``. After the backward
    pass the total gradient norm of the model's parameters is clipped to
    ``max_norm`` before the optimizer update, which guards against exploding
    gradients.
    """
    model.train()
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()

        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()

        # Clip between backward() and step() so the update uses clipped grads.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

        optimizer.step()

3.2 权重初始化

"""
权重初始化的重要性:
- 好的初始化加速收敛
- 防止梯度消失/爆炸
常用方法:
1. Xavier初始化 (Glorot)
   - 适用于Sigmoid/Tanh
   - 方差 = 2 / (fan_in + fan_out)
2. Kaiming初始化 (He)
   - 适用于ReLU
   - 方差 = 2 / fan_in
"""
def init_weights_xavier(m):
    """Xavier-uniform weight init for Linear and Conv2d modules.

    Weights are re-drawn with ``xavier_uniform_`` and an existing bias is
    zeroed. Any other module type is left untouched. Intended for use with
    ``Module.apply``.
    """
    if not isinstance(m, (nn.Linear, nn.Conv2d)):
        return
    nn.init.xavier_uniform_(m.weight)
    if m.bias is not None:
        nn.init.zeros_(m.bias)
def init_weights_kaiming(m):
    """Kaiming-normal weight init (fan-in, ReLU gain) for Linear and Conv2d.

    Weights are re-drawn with ``kaiming_normal_`` and an existing bias is
    zeroed. Any other module type is left untouched. Intended for use with
    ``Module.apply``.
    """
    if not isinstance(m, (nn.Linear, nn.Conv2d)):
        return
    nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Apply the initialization
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)
# Module.apply calls the function on every submodule, so both Linear layers
# are re-initialized here.
model.apply(init_weights_kaiming)
# 预训练权重初始化(迁移学习)
def init_from_pretrained(model, pretrained_model):
    """Copy every compatible tensor from ``pretrained_model`` into ``model``.

    A tensor is compatible when its key exists in ``model``'s state dict and
    both tensors have the same size; everything else keeps ``model``'s
    current values.

    Returns:
        ``model`` itself, after loading the merged state dict.
    """
    target_state = model.state_dict()

    # Collect only the entries that match by name and by shape.
    compatible = {}
    for key, tensor in pretrained_model.state_dict().items():
        if key not in target_state:
            continue
        if tensor.size() != target_state[key].size():
            continue
        compatible[key] = tensor

    target_state.update(compatible)
    model.load_state_dict(target_state)
    return model

四、高级训练技巧

4.1 混合精度训练

from torch.cuda.amp import autocast, GradScaler
"""
混合精度训练 (Mixed Precision):
- 使用FP16进行前向和反向传播
- FP32保存权重副本
- 减少显存占用,加速训练
"""
def train_mixed_precision(model, dataloader, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes using automatic mixed precision.

    The forward pass and loss (module-level ``criterion``) run inside
    ``autocast``; a ``GradScaler`` scales the loss before the backward pass
    and drives the optimizer step.

    Returns:
        The trained ``model``.
    """
    model.train()
    grad_scaler = GradScaler()  # gradient scaler

    for _ in range(epochs):
        for batch_inputs, batch_targets in dataloader:
            optimizer.zero_grad()

            # Forward pass and loss under mixed precision
            with autocast():
                batch_loss = criterion(model(batch_inputs), batch_targets)

            # Scale the loss, backpropagate, then step through the scaler
            grad_scaler.scale(batch_loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()

    return model

4.2 梯度累积

"""
梯度累积:模拟大batch训练
适用于显存不足的情况
"""
def train_with_gradient_accumulation(model, dataloader, optimizer, accumulation_steps=4):
    """Run one pass over ``dataloader``, stepping every ``accumulation_steps`` batches.

    Each batch loss (module-level ``criterion``) is divided by
    ``accumulation_steps`` so the accumulated gradient is the mean over the
    group. If the number of batches is not a multiple of
    ``accumulation_steps``, the trailing partial group is rescaled to its own
    mean and applied as well, instead of being silently dropped.

    Args:
        model: Module to train.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        accumulation_steps: Number of batches per optimizer step (>= 1).

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    model.train()
    optimizer.zero_grad()

    pending = 0  # batches accumulated since the last optimizer step
    for inputs, targets in dataloader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Normalize the loss so the summed gradients form an average
        loss = loss / accumulation_steps
        loss.backward()
        pending += 1

        # Update once a full group has been accumulated
        if pending == accumulation_steps:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

    if pending:
        # Leftover batches were each scaled by 1/accumulation_steps; rescale
        # so the final update averages over the batches actually seen.
        scale = accumulation_steps / pending
        for param in model.parameters():
            if param.grad is not None:
                param.grad.mul_(scale)
        optimizer.step()
        optimizer.zero_grad()

4.3 Early Stopping

class EarlyStopping:
    """Early-stopping tracker for a monitored metric.

    Call the instance once per evaluation with the latest score. It keeps
    the best score seen and counts consecutive calls without an improvement
    of more than ``min_delta``; once that count reaches ``patience`` the
    ``early_stop`` flag is set and stays set.

    Args:
        patience: Number of non-improving calls tolerated before stopping.
        min_delta: Margin by which a score must beat the best to count.
        mode: ``'min'`` when lower scores are better; any other value is
            treated as "higher is better".
    """

    def __init__(self, patience=7, min_delta=0, mode='min'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score):
        """Record ``score`` and return whether training should stop."""
        if self.best_score is None:
            # First observation only seeds the baseline.
            self.best_score = score
        else:
            if self.mode == 'min':
                improved = score < self.best_score - self.min_delta
            else:
                improved = score > self.best_score + self.min_delta

            if improved:
                self.best_score = score
                self.counter = 0
            else:
                self.counter += 1

        if self.counter >= self.patience:
            self.early_stop = True

        return self.early_stop
# Usage example
# NOTE(review): `validate` and `val_loader` are not defined in the visible part
# of this file; they have to be supplied by the surrounding training script.
early_stopping = EarlyStopping(patience=10, min_delta=0.001)
for epoch in range(100):
    # training step goes here...
    val_loss = validate(model, val_loader)
    
    if early_stopping(val_loss):
        print(f"Early stopping at epoch {epoch}")
        break

4.4 模型保存与加载

# Save the complete model object
torch.save(model, 'model_full.pth')
# Save the weights only (recommended)
torch.save(model.state_dict(), 'model_weights.pth')
# Save a checkpoint (including optimizer state)
# NOTE(review): at module level `scheduler` is bound to the
# torch.optim.lr_scheduler module alias, which has no state_dict(); a scheduler
# instance is presumably intended here. `loss` and `best_score` are also not
# defined at module level in the visible part of this file -- confirm where
# they come from.
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'loss': loss,
    'best_score': best_score
}
torch.save(checkpoint, 'checkpoint.pth')
# 加载检查点
def load_checkpoint(model, optimizer, scheduler, checkpoint_path, map_location=None):
    """Restore model, optimizer and scheduler state from a checkpoint file.

    The file is expected to hold a dict with the keys ``model_state_dict``,
    ``optimizer_state_dict``, ``scheduler_state_dict``, ``epoch`` and
    ``loss``.

    Args:
        model: Module whose parameters are restored in place.
        optimizer: Optimizer whose state is restored in place.
        scheduler: Scheduler whose state is restored in place.
        checkpoint_path: Path of the file written by ``torch.save``.
        map_location: Forwarded to ``torch.load``; pass e.g. ``'cpu'`` to
            load a checkpoint saved on a GPU on a machine without one. The
            default ``None`` keeps ``torch.load``'s own default behavior.

    Returns:
        ``(model, optimizer, scheduler, epoch, loss)``.
    """
    # SECURITY: torch.load unpickles the file; only load checkpoints from
    # sources you trust.
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    return model, optimizer, scheduler, epoch, loss
# Load weights only
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()  # switch to evaluation mode

五、训练监控

5.1 TensorBoard

from torch.utils.tensorboard import SummaryWriter
# Create the log writer; it is constructed with the log directory runs/experiment_1
writer = SummaryWriter('runs/experiment_1')
def train_with_tensorboard(model, train_loader, val_loader, epochs):
    """Train ``model`` and log losses, learning rate and histograms to TensorBoard.

    Relies on the module-level ``optimizer``, ``criterion``, ``writer`` and
    ``validate``. Per epoch it logs the batch loss every 100 batches, the
    mean training loss, the validation loss, the first parameter group's
    learning rate, and a histogram of every parameter (plus its gradient
    when one exists). The writer is closed when training finishes.

    Args:
        model: Module to train.
        train_loader: Sized iterable of ``(inputs, targets)`` batches.
        val_loader: Passed through to ``validate``.
        epochs: Number of passes over ``train_loader``.
    """
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Log the batch loss every 100 batches
            if batch_idx % 100 == 0:
                writer.add_scalar('Loss/train_batch', loss.item(), 
                                epoch * len(train_loader) + batch_idx)

        # Log the mean training loss of the epoch
        writer.add_scalar('Loss/train_epoch', train_loss / len(train_loader), epoch)

        # Validation
        model.eval()
        val_loss = validate(model, val_loader)
        writer.add_scalar('Loss/val', val_loss, epoch)

        # Log the learning rate
        writer.add_scalar('Learning_rate', optimizer.param_groups[0]['lr'], epoch)

        # Log parameter distributions
        for name, param in model.named_parameters():
            writer.add_histogram(f'Parameters/{name}', param, epoch)
            # Frozen or unused parameters have no gradient; passing None to
            # add_histogram would fail, so skip them.
            if param.grad is not None:
                writer.add_histogram(f'Gradients/{name}', param.grad, epoch)

    writer.close()
# 在终端运行查看
# tensorboard --logdir=runs

5.2 进度条

from tqdm import tqdm
def train_with_progress_bar(model, dataloader, epochs):
    """Train ``model`` for ``epochs`` passes, showing a tqdm bar per epoch.

    Uses the module-level ``optimizer`` and ``criterion``. After every batch
    the bar's postfix shows the batch loss and the first parameter group's
    learning rate.
    """
    for epoch in range(epochs):
        model.train()
        progress = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}')

        for batch_inputs, batch_targets in progress:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()

            # Refresh the values displayed next to the bar
            current_lr = optimizer.param_groups[0]["lr"]
            progress.set_postfix({
                'loss': f'{batch_loss.item():.4f}',
                'lr': f'{current_lr:.6f}'
            })

参考资源


上一篇RNN循环神经网络 下一篇CNN卷积神经网络 返回深度学习基础 最后更新: 2026年4月14日

访问 --

讨论与反馈