2022DCIC-基于文本字符的交易验证码识别
代码: https://github.com/zyf-xtu/captcha_ocr
本次的baseline由湘潭大学的maple同学贡献
实验结果
backbone | 线上表现 | 线下验证
resnet34 | 0.84 | 0.86
efficientnet-b6 | 0.96 | 0.95
efficientnet-b7 | 0.97+ | -
上分点
1.使用更强的骨干网络
2.使用数据增强mixup和cutmix等策略,暂未开源
赛题分析
比赛数据
其中训练集15000张,测试集25000张
算法思路
本方案是基于多标签分类,字符类别62,其中包含0-9,a-z,A-Z共计62个字符,需要区分大小写,单张验证码字符个数为4,使用多标签分类,需要单张图像输出62*4=248个类别
可以使用vgg、resnet、efficient等基础网络,只需要将原始网络中的分类数1000改为248即可
需要使用多标签分类损失函数:MultiLabelSoftMarginLoss
代码详解
baseline是基于pytorch框架的,版本为1.6+,其他库如OpenCV、torchvision自行安装。如果使用efficientnet模型,使用请参考
https://github.com/lukemelas/EfficientNet-PyTorch
数据处理:
# Label alphabet: digits 0-9, then a-z, then A-Z (62 characters in total).
# The position of a character in `alphabet` is its class index.
source = [str(digit) for digit in range(10)]
source.extend(chr(code) for code in range(ord('a'), ord('a') + 26))
source.extend(chr(code) for code in range(ord('A'), ord('A') + 26))
alphabet = ''.join(source)
# Build the dataset sample list
def make_dataset(data_path, alphabet, num_class, num_char):
    """Build (img_path, target) samples from a list of image paths.

    The label is taken from the filename: the `num_char` characters
    immediately before the ".png" extension.  The target is a flat 0/1
    vector of length num_class * num_char — one one-hot block of size
    num_class per character position.

    Raises ValueError if a label character is not in `alphabet`
    (the original used alphabet.find(), whose -1 on a miss silently
    set the LAST class bit — a wrong but undetected label).
    """
    samples = []
    for img_path in data_path:
        # Label = the num_char chars right before ".png" in the filename.
        target_str = img_path.split('.png')[0][-4:]
        assert len(target_str) == num_char
        target = []
        for char in target_str:
            idx = alphabet.find(char)
            if idx < 0:
                raise ValueError(
                    'label character {!r} of {!r} not in alphabet'.format(char, img_path))
            vec = [0] * num_class
            vec[idx] = 1
            target += vec
        samples.append((img_path, target))
    print(len(samples))
    return samples
# Captcha dataset: yields (image tensor, multi-label one-hot target)
class CaptchaData(Dataset):
    """Dataset of captcha images whose label is encoded in the filename.

    Each item is (image, target) where target is a flat 0/1 float tensor
    of length num_class * num_char (one one-hot block per character).
    Relies on module-level `make_dataset`, `img_loader` and `alphabet`.
    """

    def __init__(self, data_path, num_class=62, num_char=4,
                 transform=None, target_transform=None, alphabet=alphabet):
        # The original called super(Dataset, self).__init__(), which skips
        # Dataset in the MRO; plain super() is the correct cooperative call.
        super().__init__()
        self.data_path = data_path
        self.num_class = num_class
        self.num_char = num_char
        self.transform = transform
        self.target_transform = target_transform
        self.alphabet = alphabet
        self.samples = make_dataset(self.data_path, self.alphabet,
                                    self.num_class, self.num_char)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        img_path, target = self.samples[index]
        img = img_loader(img_path)  # loader defined elsewhere in this file
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return img, torch.Tensor(target)
# Image transforms (train_mean / train_std are computed elsewhere in the file)
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),  # resize to the fixed 256x256 network input
    transforms.RandomRotation((-5, 5)),  # small random rotation as augmentation
    # transforms.RandomVerticalFlip(p=0.2), # random vertical flip (disabled)
    transforms.ToTensor(),  # PIL image -> float tensor in [0, 1]
    transforms.Normalize(
        mean=train_mean,
        std=train_std
    )
])
val_transform = transforms.Compose([
    transforms.Resize((256, 256)),  # same fixed size as training
    transforms.ToTensor(),  # PIL image -> float tensor in [0, 1]
    transforms.Normalize(
        mean=train_mean,  # validation reuses the training-set statistics
        std=train_std
    )
])
模型训练和验证
# Training loop
def train(model, loss_func, optimizer, checkpoints, epochs, lr_scheduler=None):
    """Train `model` for `epochs` epochs, validating after every epoch.

    Relies on module-level globals: `train_data` / `val_data` (DataLoaders),
    `device`, and `calculat_acc`.  The model with the best validation
    accuracy is saved (whole-model pickle) to <checkpoints>/best_model.pth.

    Returns the per-epoch record list:
    [epoch, train_loss, train_acc, val_loss, val_acc].
    """
    print('Train......................')
    # Per-epoch metrics for later inspection/plotting.
    record = []
    best_acc = 0
    best_epoch = 0
    best_model_path = checkpoints + "/" + 'best_model.pth'
    # NOTE: the original used range(1, epochs), which trained epochs-1 times;
    # the inclusive upper bound makes `epochs` mean the actual epoch count.
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        # Re-enable train mode every epoch (eval() is set during validation).
        model.train()
        train_loss, train_acc, val_loss, val_acc = [], [], [], []
        for inputs, labels in train_data:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = loss_func(outputs, labels)
            # Gradients accumulate by default — clear them before backward.
            optimizer.zero_grad()
            # Current learning rate, recorded for the epoch log line.
            epoch_lr = optimizer.param_groups[0]['lr']
            loss.backward()
            optimizer.step()
            acc = calculat_acc(outputs, labels)
            train_acc.append(float(acc))
            train_loss.append(float(loss))
        if lr_scheduler:
            lr_scheduler.step()
        # Validation pass — no gradient tracking needed.
        with torch.no_grad():
            model.eval()
            for inputs, labels in val_data:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = loss_func(outputs, labels)
                acc = calculat_acc(outputs, labels)
                val_acc.append(float(acc))
                val_loss.append(float(loss))
        # Epoch-level means of the batch metrics.
        train_loss_epoch = torch.mean(torch.Tensor(train_loss))
        train_acc_epoch = torch.mean(torch.Tensor(train_acc))
        val_loss_epoch = torch.mean(torch.Tensor(val_loss))
        val_acc_epoch = torch.mean(torch.Tensor(val_acc))
        record.append(
            [epoch, train_loss_epoch.item(), train_acc_epoch.item(), val_loss_epoch.item(), val_acc_epoch.item()])
        end_time = time.time()
        print(
            'epoch:{} | time:{:.4f} | lr:{} | train_loss:{:.4f} | train_acc:{:.4f} | eval_loss:{:.4f} | val_acc:{:.4f}'.format(
                epoch,
                end_time - start_time,
                epoch_lr,
                train_loss_epoch,
                train_acc_epoch,
                val_loss_epoch,
                val_acc_epoch))
        # Keep the checkpoint with the best validation accuracy so far.
        if val_acc_epoch >= best_acc:
            best_acc = val_acc_epoch
            best_epoch = epoch
            torch.save(model, best_model_path)
    return record
# Compute accuracy: a sample counts as correct only if ALL chars match
def calculat_acc(output, target, num_class=62, num_char=4):
    """Return the fraction of samples whose entire `num_char`-char
    prediction matches the one-hot `target`.

    output: logits of shape (batch, num_class * num_char)
    target: 0/1 tensor of the same shape

    argmax is invariant under softmax, so the original softmax call is
    dropped; the per-position argmax is compared directly.
    """
    pred = torch.argmax(output.view(-1, num_class), dim=1).view(-1, num_char)
    truth = torch.argmax(target.view(-1, num_class), dim=1).view(-1, num_char)
    # A row is correct only when every character position matches.
    correct = (pred == truth).all(dim=1)
    return correct.float().mean().item()
模型推理
def predict():
    """Run the saved best model over ./data/test and write a submission CSV.

    Relies on module-level globals: `alphabet` and `init_normalize`.
    Writes sub/submit_021601.csv with columns (num, tag).
    """
    model_path = './checkpoints/best_model.pth'
    test_dir = './data/test'
    test_mean, test_std = init_normalize(test_dir, size=[256, 256])
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=test_mean,
            std=test_std
        )
    ])
    print(torch.cuda.is_available())
    # Pick the device once — the original called .cuda() unconditionally
    # below and crashed on CPU-only machines despite checking availability.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # map_location lets a GPU-trained checkpoint load on a CPU host.
    model = torch.load(model_path, map_location=device)
    model = model.to(device)
    model.eval()  # freeze dropout/batchnorm behavior for inference
    images = os.listdir(test_dir)
    # Filenames are "<num>.png" — sort numerically, not lexically.
    images.sort(key=lambda x: int(x[:-4]))
    res = []
    with torch.no_grad():
        for img in images:
            img_path = os.path.join(test_dir, img)
            image_read = Image.open(img_path)
            rgb = image_read.convert('RGB')
            tensor = transform(rgb)
            image = tensor.view(1, 3, 256, 256).to(device)
            output = model(image)
            output = output.view(-1, 62)
            # argmax is invariant under softmax, so no normalization needed.
            output = torch.argmax(output, dim=1)
            output = output.view(-1, 4)[0]
            pred = ''.join([alphabet[i] for i in output.cpu().numpy()])
            print(img, pred)
            res.append({'num': int(img[:-4]), 'tag': pred})
    header = ['num', 'tag']
    os.makedirs('sub', exist_ok=True)
    # newline='' stops csv from emitting blank lines between rows on Windows.
    with open('sub/submit_021601.csv', 'w', encoding='utf_8_sig', newline='') as f:
        f_csv = csv.DictWriter(f, header)
        f_csv.writeheader()
        f_csv.writerows(res)