LSTM
LSTM(长短期记忆网络)是深度学习中最重要的架构之一。1997 年由 Hochreiter 和 Schmidhuber 提出,专门设计用于解决传统 RNN 的梯度消失问题,能够学习长期依赖关系。
LSTM 的核心创新:
- 门控机制:通过遗忘门、输入门、输出门控制信息流动
- 细胞状态:信息高速公路,让梯度能够长距离传播
- 长期记忆:有效捕获序列中的长期依赖关系
截至 2025 年,《Long Short-Term Memory》论文在谷歌学术上的引用次数位居深度学习论文前列,是该领域最具影响力的论文之一。
LSTM 简介
LSTM(Long Short-Term Memory)是一种特殊的循环神经网络(RNN)架构,专门设计用于解决传统 RNN 在处理长序列数据时的两大问题:
- 梯度消失问题:反向传播时梯度逐层衰减,导致无法学习长期依赖
- 长期依赖问题:难以捕获序列中相距较远的元素之间的关系
LSTM 的应用场景:
- 自然语言处理(文本生成、机器翻译、情感分析)
- 时间序列预测(股票预测、天气预报)
- 语音识别
- 视频分析
- 异常检测
LSTM 的局限性:
- ❌ 对于超长序列仍有困难,已被 Transformer 在多个任务上超越
RNN 的问题
传统 RNN 在处理长序列时存在梯度消失问题:
RNN 的梯度消失问题:
在反向传播过程中,梯度需要通过时间步反向传播。当序列很长时(T很大),梯度会:
- 梯度消失:梯度在反向传播时指数级衰减,趋近于0
- 梯度爆炸:梯度在反向传播时指数级增长,数值变得极大甚至溢出
具体来说:
梯度计算涉及多个时间步的连乘:
∂L/∂h_0 = ∂L/∂h_T × ∏_{t=1}^{T} (∂h_t/∂h_{t-1})
如果每个导数项 < 1,连乘后会趋近于0(梯度消失)
如果每个导数项 > 1,连乘后会趋近于∞(梯度爆炸)
结果:模型无法学习长期依赖关系(例如,句子开头的主语和结尾的谓语之间的关系)
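下面用一个玩具实验直观感受梯度消失(演示代码,假设使用随机初始化的单层 tanh RNN,具体数值因初始化而异):
import torch
import torch.nn as nn
# 观察普通 RNN 中,最后一个时间步的梯度反传回第一个时间步后的大小
torch.manual_seed(0)
rnn = nn.RNN(input_size=1, hidden_size=8, batch_first=True)
for T in [10, 50, 200]:
    x = torch.randn(1, T, 1, requires_grad=True)
    out, _ = rnn(x)
    out[:, -1, :].sum().backward()              # 只对最后一个时间步的输出求梯度
    grad_first_step = x.grad[0, 0].abs().item() # 传回第 1 个时间步输入的梯度大小
    print(f"T={T}, 传回序列开头的梯度: {grad_first_step:.2e}")
# 通常可以看到:T 越大,传回序列开头的梯度越小(梯度消失)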
LSTM 结构
LSTM 通过引入门控机制和细胞状态来解决梯度消失问题。
核心组件
LSTM 单元包含三个门和一个细胞状态:
- 遗忘门(Forget Gate):决定从细胞状态中丢弃什么信息
- 输入门(Input Gate):决定什么新信息存储到细胞状态
- 输出门(Output Gate):决定输出什么信息
- 细胞状态(Cell State):信息的"高速公路",梯度可以畅通无阻地流动
数学公式
LSTM 的计算过程:
# 输入:x_t(当前输入)、h_{t-1}(上一时刻隐藏状态)、c_{t-1}(上一时刻细胞状态)
# 1. 遗忘门:决定丢弃多少旧信息
f_t = σ(W_f · [h_{t-1}, x_t] + b_f)
# 2. 输入门:决定存储多少新信息
i_t = σ(W_i · [h_{t-1}, x_t] + b_i)
c̃_t = tanh(W_c · [h_{t-1}, x_t] + b_c) # 候选细胞状态
# 3. 更新细胞状态
c_t = f_t ⊙ c_{t-1} + i_t ⊙ c̃_t
# 4. 输出门:决定输出什么
o_t = σ(W_o · [h_{t-1}, x_t] + b_o)
h_t = o_t ⊙ tanh(c_t)
# 符号说明:
# σ:sigmoid 函数(输出0-1)
# tanh:双曲正切函数(输出-1到1)
# ⊙:逐元素乘法
# W、b:可学习的权重和偏置
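可以用 PyTorch 的 nn.LSTMCell 按上述公式手动算一步来验证(演示代码;PyTorch 将 i、f、g、o 四个门的权重按行拼接存储):
import torch
import torch.nn as nn
torch.manual_seed(0)
input_size, hidden_size = 4, 3
cell = nn.LSTMCell(input_size, hidden_size)
x_t = torch.randn(1, input_size)
h_prev = torch.zeros(1, hidden_size)
c_prev = torch.zeros(1, hidden_size)
# 手动按公式计算一步
gates = x_t @ cell.weight_ih.T + h_prev @ cell.weight_hh.T + cell.bias_ih + cell.bias_hh
i_t, f_t, g_t, o_t = gates.chunk(4, dim=1)      # 按 (i, f, g, o) 顺序切分
i_t, f_t, o_t = torch.sigmoid(i_t), torch.sigmoid(f_t), torch.sigmoid(o_t)
c_tilde = torch.tanh(g_t)                       # 即公式中的 c̃_t
c_t = f_t * c_prev + i_t * c_tilde
h_t = o_t * torch.tanh(c_t)
# 与 nn.LSTMCell 的输出对比
h_ref, c_ref = cell(x_t, (h_prev, c_prev))
print(torch.allclose(h_t, h_ref), torch.allclose(c_t, c_ref))  # 预期输出: True True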
门的作用理解:
- 遗忘门 (f_t):像一个过滤器
- 接近1:保留信息
- 接近0:遗忘信息
- 输入门 (i_t):像一个开关
- 接近1:接受新信息
- 接近0:拒绝新信息
- 输出门 (o_t):决定输出
- 控制有多少细胞状态信息传递到下一层
类比:细胞状态就像传送带
- 信息可以沿着传送带直接流动(不经过复杂的非线性变换)
- 门机制决定在每个时间步添加或删除什么信息
- 这使得梯度能够长距离传播而不消失
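一个极简的数值对比可以说明这一点(玩具例子,门的取值是人为固定的假设值):
import torch
# 细胞状态:遗忘门接近 1 时,旧信息经过很多时间步后仍能保留相当一部分
c = torch.tensor([1.0])
f = torch.tensor([0.99])          # 假设遗忘门固定为 0.99
for _ in range(100):
    c = f * c                     # 只看旧信息的保留程度,不加入新信息
print(c.item())                   # ≈ 0.37
# 对比:普通 RNN 的隐藏状态反复乘以小于 1 的因子,很快衰减到接近 0
h = torch.tensor([1.0])
w = torch.tensor([0.5])           # 假设每步的乘性因子为 0.5
for _ in range(100):
    h = w * h
print(h.item())                   # ≈ 8e-31,信息(以及梯度)几乎完全消失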
简化理解版本
import torch
import torch.nn as nn
# LSTM 的简化理解版本
class SimplifiedLSTM:
    """
    LSTM 工作流程演示(非真实实现;为了能直接运行,权重在此随机初始化)
    """
    def __init__(self, input_size, hidden_size):
        # 四组门的权重 (hidden_size, hidden_size + input_size) 和偏置 (hidden_size,)
        self.W_f = torch.randn(hidden_size, hidden_size + input_size)
        self.W_i = torch.randn(hidden_size, hidden_size + input_size)
        self.W_c = torch.randn(hidden_size, hidden_size + input_size)
        self.W_o = torch.randn(hidden_size, hidden_size + input_size)
        self.b_f = torch.zeros(hidden_size)
        self.b_i = torch.zeros(hidden_size)
        self.b_c = torch.zeros(hidden_size)
        self.b_o = torch.zeros(hidden_size)
    def step(self, x_t, h_prev, c_prev):
        """
        x_t: 当前输入,形状 (batch, input_size)
        h_prev: 上一时刻隐藏状态,形状 (batch, hidden_size)
        c_prev: 上一时刻细胞状态,形状 (batch, hidden_size)
        """
        # 拼接隐藏状态和输入
        combined = torch.cat([h_prev, x_t], dim=1)
        # 1. 遗忘门:要忘记什么?
        f_t = torch.sigmoid(combined @ self.W_f.T + self.b_f)
        # 例如:在阅读新句子时,忘记上一句的主语
        # 2. 输入门:要记住什么新信息?
        i_t = torch.sigmoid(combined @ self.W_i.T + self.b_i)
        c_tilde = torch.tanh(combined @ self.W_c.T + self.b_c)
        # 例如:记住新句子的主语
        # 3. 更新细胞状态
        c_t = f_t * c_prev + i_t * c_tilde
        # 旧信息 × 遗忘门 + 新信息 × 输入门
        # 4. 输出门:输出什么?
        o_t = torch.sigmoid(combined @ self.W_o.T + self.b_o)
        h_t = o_t * torch.tanh(c_t)
        # 例如:基于主语和上下文生成对应的谓语
        return h_t, c_t
# 实际使用 PyTorch 的 LSTM
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=1, batch_first=True)
# 输入:(batch_size, seq_len, input_size)
x = torch.randn(2, 5, 10) # 2个样本,每个序列长度5,特征维度10
# 前向传播
output, (h_n, c_n) = lstm(x)
print(f"输出形状: {output.shape}") # (2, 5, 20) - 每个时间步的输出
print(f"最终隐藏状态: {h_n.shape}") # (1, 2, 20) - 最后时刻的h
print(f"最终细胞状态: {c_n.shape}") # (1, 2, 20) - 最后时刻的c
基础示例
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
class LSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.5):
super(LSTMModel, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
# LSTM 层
self.lstm = nn.LSTM(
input_size=input_size,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
dropout=dropout if num_layers > 1 else 0
)
# 全连接层
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x):
# x: (batch_size, seq_len, input_size)
# LSTM 前向传播
# output: (batch_size, seq_len, hidden_size)
# (h_n, c_n): 最后时刻的隐藏状态和细胞状态
output, (h_n, c_n) = self.lstm(x)
# 取最后一个时间步的输出
last_output = output[:, -1, :] # (batch_size, hidden_size)
# 通过全连接层
out = self.fc(last_output) # (batch_size, output_size)
return out
# 创建模型
model = LSTMModel(
input_size=10,
hidden_size=64,
num_layers=2,
output_size=1,
dropout=0.5
)
print(model)
# LSTMModel(
# (lstm): LSTM(10, 64, num_layers=2, batch_first=True, dropout=0.5)
# (fc): Linear(in_features=64, out_features=1, bias=True)
# )
# 测试前向传播
x = torch.randn(32, 20, 10) # 32个样本,序列长度20,特征维度10
output = model(x)
print(f"输出形状: {output.shape}") # (32, 1)
PyTorch LSTM 参数说明:
| 参数 | 说明 | 默认值 |
|---|---|---|
| input_size | 输入特征维度 | 必需 |
| hidden_size | 隐藏层维度 | 必需 |
| num_layers | LSTM 层数 | 1 |
| bias | 是否使用偏置 | True |
| batch_first | 输入形状是否为 (batch, seq, feature) | False |
| dropout | 层间 Dropout 比率(仅 num_layers > 1 时生效) | 0 |
| bidirectional | 是否双向 | False |
batch_first 参数:
- True:输入形状为 (batch, seq, feature)(推荐)
- False:输入形状为 (seq, batch, feature)(PyTorch 默认)
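下面的小例子对比两种布局(演示用,尺寸为任意取值):
import torch
import torch.nn as nn
lstm_bf = nn.LSTM(input_size=10, hidden_size=20, batch_first=True)
lstm_sf = nn.LSTM(input_size=10, hidden_size=20)   # 默认 batch_first=False
x_bf = torch.randn(2, 5, 10)      # (batch, seq, feature)
x_sf = x_bf.transpose(0, 1)       # (seq, batch, feature)
out_bf, _ = lstm_bf(x_bf)
out_sf, _ = lstm_sf(x_sf)
print(out_bf.shape)               # torch.Size([2, 5, 20])
print(out_sf.shape)               # torch.Size([5, 2, 20])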
序列到序列(Seq2Seq)
使用 LSTM 进行序列转换,如机器翻译。
import torch
import torch.nn as nn
class Encoder(nn.Module):
"""编码器:将输入序列编码为上下文向量"""
def __init__(self, input_size, embedding_dim, hidden_size, num_layers):
super().__init__()
self.embedding = nn.Embedding(input_size, embedding_dim)
self.lstm = nn.LSTM(
embedding_dim,
hidden_size,
num_layers,
batch_first=True
)
def forward(self, x):
# x: (batch_size, seq_len)
embedded = self.embedding(x) # (batch, seq, emb)
outputs, (h_n, c_n) = self.lstm(embedded)
return h_n, c_n # 返回最终状态作为上下文
class Decoder(nn.Module):
"""解码器:根据上下文生成输出序列"""
def __init__(self, output_size, embedding_dim, hidden_size, num_layers):
super().__init__()
self.embedding = nn.Embedding(output_size, embedding_dim)
self.lstm = nn.LSTM(
embedding_dim,
hidden_size,
num_layers,
batch_first=True
)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x, h_0, c_0):
# x: (batch_size, seq_len)
# h_0, c_0: 编码器的最终状态
embedded = self.embedding(x)
outputs, (h_n, c_n) = self.lstm(embedded, (h_0, c_0))
predictions = self.fc(outputs) # (batch, seq, vocab)
return predictions, (h_n, c_n)
class Seq2Seq(nn.Module):
"""序列到序列模型"""
def __init__(self, encoder, decoder):
super().__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, src, trg, teacher_forcing_ratio=0.5):
# src: (batch_size, src_len) 源语言
# trg: (batch_size, trg_len) 目标语言
batch_size = trg.shape[0]
trg_len = trg.shape[1]
trg_vocab_size = self.decoder.fc.out_features
# 存储解码器输出
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=trg.device)
# 编码器
h, c = self.encoder(src)
# 解码器第一个输入(<SOS> token)
input = trg[:, 0].unsqueeze(1)
# 逐步解码
for t in range(1, trg_len):
output, (h, c) = self.decoder(input, h, c)
outputs[:, t, :] = output.squeeze(1)
# Teacher forcing:随机使用真实值或预测值
teacher_force = torch.rand(1).item() < teacher_forcing_ratio
top1 = output.argmax(2)
input = trg[:, t].unsqueeze(1) if teacher_force else top1
return outputs
# 创建 Seq2Seq 模型
encoder = Encoder(
input_size=10000, # 源语言词汇表大小
embedding_dim=256,
hidden_size=512,
num_layers=2
)
decoder = Decoder(
output_size=8000, # 目标语言词汇表大小
embedding_dim=256,
hidden_size=512,
num_layers=2
)
model = Seq2Seq(encoder, decoder)
# 示例输入
src = torch.randint(0, 10000, (32, 20)) # 源序列
trg = torch.randint(0, 8000, (32, 15)) # 目标序列
# 前向传播
outputs = model(src, trg, teacher_forcing_ratio=0.5)
print(f"输出形状: {outputs.shape}") # (32, 15, 8000)
Seq2Seq 关键概念:
- Teacher Forcing
  - 训练时:按一定概率使用真实目标序列作为解码器输入(加速收敛)
  - 推理时:使用模型自己的预测作为下一步输入
  # 训练时
  input = trg[:, t]    # 使用真实值
  # 推理时
  input = prediction   # 使用预测值
- 编码器-解码器架构
  - 编码器:压缩源序列为固定长度的上下文向量
  - 解码器:根据上下文生成目标序列
- 注意力机制(后续改进)
  - 解决固定长度上下文的瓶颈
  - 让解码器在每步关注编码器的不同部分(见下方示意)
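作为补充,下面是点积注意力的一个极简示意(假设的实现,仅展示"解码器每一步对编码器各位置加权求和"的思想,并非上文 Seq2Seq 代码的一部分):
import torch
def dot_product_attention(decoder_hidden, encoder_outputs):
    # decoder_hidden:  (batch, hidden)          当前解码步的隐藏状态
    # encoder_outputs: (batch, src_len, hidden) 编码器每个位置的输出
    scores = torch.bmm(encoder_outputs, decoder_hidden.unsqueeze(2))  # (batch, src_len, 1)
    weights = torch.softmax(scores, dim=1)                            # 对源序列各位置归一化
    context = (weights * encoder_outputs).sum(dim=1)                  # (batch, hidden) 加权上下文
    return context, weights.squeeze(2)
# 用随机张量演示形状
dec_h = torch.randn(32, 512)
enc_out = torch.randn(32, 20, 512)
context, attn = dot_product_attention(dec_h, enc_out)
print(context.shape, attn.shape)  # torch.Size([32, 512]) torch.Size([32, 20])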
文本情感分析
情感分析示例(完整可运行)。
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# 自定义数据集
class SentimentDataset(Dataset):
def __init__(self, texts, labels, vocab, max_len=100):
self.texts = texts
self.labels = labels
self.vocab = vocab
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = self.texts[idx]
label = self.labels[idx]
# 文本转索引
indices = [self.vocab.get(word, self.vocab['<UNK>']) for word in text.split()]
# 填充或截断
if len(indices) < self.max_len:
indices += [self.vocab['<PAD>']] * (self.max_len - len(indices))
else:
indices = indices[:self.max_len]
return torch.LongTensor(indices), torch.LongTensor([label])
# 模型定义
class SentimentLSTM(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
self.lstm = nn.LSTM(
embedding_dim,
hidden_size,
num_layers,
batch_first=True,
dropout=0.5
)
self.fc = nn.Linear(hidden_size, num_classes)
def forward(self, x):
embedded = self.embedding(x)
lstm_out, (h_n, c_n) = self.lstm(embedded)
# 取最后一个时间步(或使用平均池化)
last_hidden = lstm_out[:, -1, :]
out = self.fc(last_hidden)
return out
# 示例数据
texts = [
"I love this movie",
"This is terrible",
"Great performance",
"Waste of time"
]
labels = [1, 0, 1, 0] # 1=正面, 0=负面
# 构建词汇表(简化)
vocab = {
'<PAD>': 0,
'<UNK>': 1,
'I': 2,
'love': 3,
'this': 4,
'movie': 5,
'is': 6,
'terrible': 7,
'Great': 8,
'performance': 9,
'Waste': 10,
'of': 11,
'time': 12
}
# 创建数据集和数据加载器
dataset = SentimentDataset(texts, labels, vocab, max_len=20)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
# 创建模型
model = SentimentLSTM(
vocab_size=len(vocab),
embedding_dim=50,
hidden_size=128,
num_layers=2,
num_classes=2
)
# 训练
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 训练循环
num_epochs = 10
for epoch in range(num_epochs):
model.train()
total_loss = 0
for batch_texts, batch_labels in dataloader:
# 前向传播
outputs = model(batch_texts)
loss = criterion(outputs, batch_labels.squeeze())
# 反向传播
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
avg_loss = total_loss / len(dataloader)
print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')
# 预测
model.eval()
test_text = "I love this"
test_indices = [vocab.get(word, vocab['<UNK>']) for word in test_text.split()]
test_indices += [vocab['<PAD>']] * (20 - len(test_indices))
test_tensor = torch.LongTensor([test_indices])
with torch.no_grad():
output = model(test_tensor)
prediction = torch.argmax(output, dim=1)
print(f"预测结果: {'正面' if prediction.item() == 1 else '负面'}")
Bidirectional LSTM(双向LSTM)
双向 LSTM 同时从前向后和从后向前处理序列,能够捕获双向的上下文信息。
import torch
import torch.nn as nn
class BiLSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size):
super().__init__()
# 双向 LSTM
self.lstm = nn.LSTM(
input_size=input_size,
hidden_size=hidden_size,
num_layers=num_layers,
batch_first=True,
bidirectional=True # 关键参数
)
# 注意:双向LSTM的输出维度是 hidden_size * 2
self.fc = nn.Linear(hidden_size * 2, output_size)
def forward(self, x):
# x: (batch_size, seq_len, input_size)
# LSTM输出
# output: (batch_size, seq_len, hidden_size * 2)
# h_n: (num_layers * 2, batch_size, hidden_size)
output, (h_n, c_n) = self.lstm(x)
# 取最后时间步的输出
last_output = output[:, -1, :] # (batch_size, hidden_size * 2)
# 分类
out = self.fc(last_output)
return out
# 使用
model = BiLSTM(input_size=10, hidden_size=64, num_layers=2, output_size=2)
x = torch.randn(32, 20, 10)
output = model(x)
print(f"输出形状: {output.shape}") # (32, 2)
双向 LSTM 的优势:
# 单向 LSTM:预测 [苹果] 这个位置时只能看到过去
"我 爱 吃 [苹果] 和 香蕉" → 只看到 "我 爱 吃"
# 双向 LSTM:预测 [苹果] 这个位置时既看过去又看未来
"我 爱 吃 [苹果] 和 香蕉" → 看到 "我 爱 吃" + "和 香蕉"
# 适用场景:
- ✅ 文本分类(整个句子已知)
- ✅ 命名实体识别(需要上下文)
- ✅ 词性标注
- ❌ 实时预测(未来信息未知)
- ❌ 文本生成(逐字生成)
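下面的小例子展示双向 LSTM 输出维度的构成,以及分类时一种常见的句子表示取法(演示代码,尺寸为任意取值):
import torch
import torch.nn as nn
# 双向 LSTM 输出的最后一维由前向和后向两部分拼接而成
bilstm = nn.LSTM(input_size=10, hidden_size=64, batch_first=True, bidirectional=True)
x = torch.randn(32, 20, 10)
output, (h_n, c_n) = bilstm(x)
forward_out = output[:, :, :64]    # 前向方向的输出
backward_out = output[:, :, 64:]   # 后向方向的输出
print(output.shape)                # torch.Size([32, 20, 128])
print(h_n.shape)                   # torch.Size([2, 32, 64]),两个方向各一份
# 分类任务中一种常见做法:拼接前向最后一步与后向第一步的输出作为句子表示
sentence_repr = torch.cat([forward_out[:, -1, :], backward_out[:, 0, :]], dim=1)
print(sentence_repr.shape)         # torch.Size([32, 128])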
训练技巧
梯度裁剪
防止梯度爆炸的重要技术。
import torch
import torch.nn as nn
import torch.optim as optim
model = LSTMModel(...)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
# 训练循环
for epoch in range(num_epochs):
for batch_x, batch_y in dataloader:
# 前向传播
outputs = model(batch_x)
loss = criterion(outputs, batch_y)
# 反向传播
optimizer.zero_grad()
loss.backward()
# 梯度裁剪(重要!)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
# 或使用值裁剪
# torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)
optimizer.step()
梯度裁剪方法:
- 梯度范数裁剪(推荐)
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
  # 如果梯度总范数 > 5.0,按比例缩放所有梯度
- 梯度值裁剪
  torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)
  # 每个梯度值被限制在 [-0.5, 0.5] 内
何时使用:
- LSTM/GRU:几乎总是使用
- 训练不稳定:尝试裁剪
- loss 出现 NaN:降低 max_norm
推荐值:
- max_norm: 1.0 - 10.0(常用5.0)
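补充一点:clip_grad_norm_ 会返回裁剪前的梯度总范数,可以用它监控训练是否稳定(下面是一个演示性的小例子):
import torch
import torch.nn as nn
lstm = nn.LSTM(input_size=10, hidden_size=64, batch_first=True)
x = torch.randn(8, 30, 10)
out, _ = lstm(x)
loss = out.pow(2).mean()
loss.backward()
total_norm = torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=5.0)
print(f"裁剪前的梯度总范数: {total_norm.item():.4f}")
# 如果这个值经常远大于 max_norm,或出现 inf/nan,通常说明学习率过大或需要更严格的裁剪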
过拟合防止
import torch
import torch.nn as nn
import torch.optim as optim
class RobustLSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.5):
super().__init__()
self.embedding = nn.Embedding(input_size, 128)
# 1. LSTM dropout(层间)
self.lstm = nn.LSTM(
128,
hidden_size,
num_layers,
batch_first=True,
dropout=dropout if num_layers > 1 else 0
)
# 2. 额外的 dropout
self.dropout = nn.Dropout(dropout)
# 3. 全连接层
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x):
embedded = self.embedding(x)
# LSTM
lstm_out, _ = self.lstm(embedded)
# Dropout
dropped = self.dropout(lstm_out[:, -1, :])
# 输出
out = self.fc(dropped)
return out
# 其他防止过拟合的方法
# 4. 权重衰减(L2正则化)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# 5. 早停(Early Stopping)
class EarlyStopping:
def __init__(self, patience=7, min_delta=0):
self.patience = patience
self.min_delta = min_delta
self.counter = 0
self.best_loss = None
self.early_stop = False
def __call__(self, val_loss):
if self.best_loss is None:
self.best_loss = val_loss
elif val_loss > self.best_loss - self.min_delta:
self.counter += 1
if self.counter >= self.patience:
self.early_stop = True
else:
self.best_loss = val_loss
self.counter = 0
# 使用早停
early_stopping = EarlyStopping(patience=10)
for epoch in range(num_epochs):
train_loss = train_one_epoch(model, train_loader)
val_loss = validate(model, val_loader)
early_stopping(val_loss)
if early_stopping.early_stop:
print(f"Early stopping at epoch {epoch}")
break
# 6. 数据增强
# 对于文本:同义词替换、随机删除、回译
# 对于时间序列:添加噪声、时间扭曲
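上面提到的时间序列加噪增强,可以写成类似下面的辅助函数(示意代码,augment_time_series 是假设的名字,噪声幅度需按数据尺度调整):
import torch
def augment_time_series(x, noise_std=0.01):
    """对 (batch, seq_len, feature) 形状的时间序列批量添加高斯噪声"""
    return x + torch.randn_like(x) * noise_std
x = torch.randn(32, 50, 10)
x_aug = augment_time_series(x)
print(x_aug.shape)  # torch.Size([32, 50, 10])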
防止过拟合的方法优先级:
- 增加数据(最有效)
- Dropout(0.3-0.5)
- 权重衰减(1e-4 到 1e-6)
- 早停(监控验证集)
- 降低模型复杂度(减少层数或隐藏单元)
- 数据增强
# 完整配置示例
model = nn.LSTM(..., dropout=0.5) # LSTM dropout
optimizer = Adam(..., weight_decay=1e-5) # L2正则
scheduler = ReduceLROnPlateau(...) # 学习率调度
early_stopping = EarlyStopping(patience=10) # 早停
学习率调度
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR
model = LSTMModel(...)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 方式1:验证集loss不下降时降低学习率
scheduler = ReduceLROnPlateau(
optimizer,
mode='min', # 监控指标是否应该降低
factor=0.5, # 学习率缩放因子
patience=5, # 容忍多少个epoch
verbose=True
)
for epoch in range(num_epochs):
train_loss = train(model, train_loader)
val_loss = validate(model, val_loader)
# 更新学习率
scheduler.step(val_loss)
# 方式2:余弦退火
scheduler = CosineAnnealingLR(
optimizer,
T_max=50, # 周期
eta_min=1e-6 # 最小学习率
)
for epoch in range(num_epochs):
train(model, train_loader)
scheduler.step() # 每个epoch后更新
# 方式3:分段常数
scheduler = optim.lr_scheduler.StepLR(
optimizer,
step_size=30, # 每30个epoch
gamma=0.1 # 学习率乘以0.1
)
调试技巧
import torch
import torch.nn as nn
# 1. 检查输入输出形状
def check_shapes(model, input_shape):
x = torch.randn(*input_shape)
print(f"输入形状: {x.shape}")
output = model(x)
print(f"输出形状: {output.shape}")
# 检查中间层
for name, module in model.named_modules():
if isinstance(module, nn.LSTM):
print(f"{name} - LSTM层")
# 2. 检查梯度
def check_gradients(model):
for name, param in model.named_parameters():
if param.grad is not None:
grad_norm = param.grad.norm().item()
print(f"{name}: 梯度范数 = {grad_norm:.4f}")
if grad_norm == 0:
print(f" 警告:{name} 的梯度为0!")
elif grad_norm > 100:
print(f" 警告:{name} 的梯度过大!")
# 3. 检查权重
def check_weights(model):
for name, param in model.named_parameters():
print(f"{name}:")
print(f" 均值: {param.data.mean():.4f}")
print(f" 标准差: {param.data.std():.4f}")
print(f" 最小值: {param.data.min():.4f}")
print(f" 最大值: {param.data.max():.4f}")
# 使用
model = LSTMModel(...)
check_shapes(model, (32, 50, 10))
# 训练一步后检查梯度
loss.backward()
check_gradients(model)
check_weights(model)
性能优化
import torch
import torch.nn as nn
# 1. 使用 GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
x = x.to(device)
# 2. 混合精度训练(减少内存,加速训练)
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
for epoch in range(num_epochs):
for batch_x, batch_y in dataloader:
batch_x = batch_x.to(device)
batch_y = batch_y.to(device)
optimizer.zero_grad()
# 使用自动混合精度
with autocast():
outputs = model(batch_x)
loss = criterion(outputs, batch_y)
# 缩放梯度
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# 3. DataLoader 优化
dataloader = DataLoader(
dataset,
batch_size=64,
shuffle=True,
num_workers=4, # 多进程加载数据
pin_memory=True # 加速数据传输到GPU
)
# 4. 模型编译(PyTorch 2.0+)
model = torch.compile(model) # 自动优化
性能优化建议:
- 批次大小
  - GPU:尽量大(受显存限制)
  - CPU:32-128
- 序列长度
  - 尽量不要太长(LSTM 按时间步顺序计算,序列越长越慢且难以并行)
  - 可以截断或采样(见下方示意)
- 隐藏层大小
  - 常用:64、128、256、512
  - 太小:欠拟合
  - 太大:过拟合、训练慢
- 层数
  - 1-2 层:大多数任务够用
  - 3-4 层:复杂任务
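其中"截断或采样"可以用类似下面的方式实现(示意代码,truncate 和 split_into_windows 为假设的辅助函数):
import torch
def truncate(x, max_len=200):
    # x: (batch, seq_len, feature),截断到固定长度
    return x[:, :max_len, :]
def split_into_windows(x, window=200):
    # 把一条长序列 (seq_len, feature) 切成若干长度为 window 的片段
    return [x[i:i + window] for i in range(0, x.size(0), window)]
x = torch.randn(8, 1000, 16)
print(truncate(x).shape)              # torch.Size([8, 200, 16])
print(len(split_into_windows(x[0])))  # 5 个长度为 200 的片段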
# 性能对比测试
import time
configs = [
{'hidden': 64, 'layers': 1},
{'hidden': 128, 'layers': 2},
{'hidden': 256, 'layers': 3}
]
for config in configs:
    model = LSTMModel(input_size=10, hidden_size=config['hidden'],
                      num_layers=config['layers'], output_size=1)
start = time.time()
train(model)
duration = time.time() - start
print(f"配置 {config}: {duration:.2f}秒")