零、前言
图片验证码识别是一个老生常谈的话题,目前的方法主要包括图片分析、打码平台以及学习算法(机器学习、深度学习)等。本文就使用Tensorflow实现一个验证码识别模型,同时会展示使用Tensorflow进行深度学习的完整过程。
一、数据采集
图片验证码的数据采集就是收集验证码图片及其对应的验证码值,这里我们使用Python的captcha模块进行自动化的生成,训练模块通过调用下面的代码不断获取数据。
#!/usr/bin/python
# -*- coding: utf-8 -*
from captcha.image import ImageCaptcha
import numpy as np
import random
import string
class captcha_tensorflow():
    """Generate captcha images and their one-hot labels.

    Images are rendered with ``ImageCaptcha``; the training loop calls
    ``get_batch_captcha`` repeatedly to obtain fresh data.
    """

    def __init__(self, width=160, height=60, char_num=4,
                 characters=string.digits + string.ascii_uppercase + string.ascii_lowercase):
        # Captcha image width in pixels
        self.width = width
        # Captcha image height in pixels
        self.height = height
        # Number of characters in one captcha
        self.char_num = char_num
        # Character set the captcha text is drawn from
        self.characters = characters
        # Number of distinct classes (size of the character set)
        self.classes = len(characters)

    def get_parameter(self):
        """Return ``(width, height, char_num, characters, classes)``."""
        return self.width, self.height, self.char_num, self.characters, self.classes

    def _random_text(self):
        """Return ``char_num`` characters drawn independently from the set.

        Characters are drawn with replacement, so texts with repeated
        characters can occur and ``char_num`` may exceed the size of the
        character set.
        """
        return ''.join(random.choice(self.characters) for _ in range(self.char_num))

    def get_batch_captcha(self, batch_size=64):
        """Generate one batch of training captchas.

        Returns:
            A tuple ``(img_x, img_y)``. ``img_x`` has shape
            ``[batch_size, height, width, 1]`` and holds the grayscale
            pixel values divided by 255.0. ``img_y`` has shape
            ``[batch_size, char_num * classes]`` and holds the
            concatenated one-hot encoding of every character.
        """
        # Layout: batch, height, width, channels (RGB is reduced to 1 channel below)
        img_x = np.zeros([batch_size, self.height, self.width, 1])
        img_y = np.zeros([batch_size, self.char_num, self.classes])
        # A single generator instance serves the whole batch
        image = ImageCaptcha(width=self.width, height=self.height)
        for i in range(batch_size):
            captcha_str = self._random_text()
            # Convert the RGB image to grayscale
            img = image.generate_image(captcha_str).convert("L").getdata()
            img_data = np.array(img)
            img_x[i] = np.reshape(img_data, [self.height, self.width, 1]) / 255.0
            # One-hot encode each character of the captcha text
            for j, ch in enumerate(captcha_str):
                img_y[i, j, self.characters.find(ch)] = 1
        img_y = np.reshape(img_y, (batch_size, self.char_num * self.classes))
        return img_x, img_y

    def gen_test_captcha(self):
        """Render one random captcha and save it as ``<text>.jpg``."""
        image = ImageCaptcha(width=self.width, height=self.height)
        captcha_str = self._random_text()
        img = image.generate_image(captcha_str)
        img.save(captcha_str + '.jpg')
if __name__ == "__main__":
    # Script entry point: render one sample captcha image into the
    # current working directory.
    captcha_tensorflow().gen_test_captcha()
这里重点讲解一下get_batch_captcha函数,它用于获得一个批次的验证码数据,包含图片数据及其标签。图片数据的格式为[batch_size, height, width, channels],分别为批次大小,图片高度,图片宽度,通道个数,这是卷积函数conv2d的默认输入格式;标签数据的格式为[batch_size, char_num*classes],分别为批次大小和验证码图片的值,这里的值是把验证码中每个字符做one-hot(二进制)编码后拼接得到的。
深度学习里进行样本训练一般每次都只会读取batch_size大小的数据,当然也可以将batch_size的大小定义为样本集的大小,这样所有的样本就会一次性全部读入,但batch_size的作用在于在算法效率与内存容量之间取得平衡:batch_size太大,内存难以承受,同时也会导致迭代次数减少,使参数修正变得缓慢;batch_size太小,算法难以收敛。
二、 模型构建
模型使用了CNN卷积神经网络,包含了三层卷积层和两层全连接层,每一次卷积跟着一次池化操作,如图所示。
#-*-coding:utf-8-*-
import tensorflow as tf
import string
class create_model():
    """CNN for captcha recognition.

    The network has three convolution layers, each followed by a 2x2 max
    pooling, and two fully connected layers.
    """

    def __init__(self, width=160, height=60, char_num=4,
                 characters=string.digits + string.ascii_uppercase + string.ascii_lowercase):
        # Captcha image width in pixels
        self.width = width
        # Captcha image height in pixels
        self.height = height
        # Number of characters in one captcha
        self.char_num = char_num
        # Character set the captcha text is drawn from
        self.characters = characters
        # Number of distinct classes (size of the character set)
        self.classes = len(characters)

    def weight_init(self, shape, name):
        """Create a weight variable drawn from N(0, 0.1^2)."""
        return tf.get_variable(name, shape, initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))

    def bias_init(self, shape, name):
        """Create a bias variable initialized to zero."""
        return tf.get_variable(name, shape, initializer=tf.constant_initializer(0.0))

    def conv2d(self, x, conv_w):
        """2-D convolution with stride 1 in both dimensions and SAME padding."""
        return tf.nn.conv2d(x, conv_w, strides=[1, 1, 1, 1], padding="SAME")

    def max_pool(self, x, size):
        """Max pooling over size x size windows with SAME padding."""
        return tf.nn.max_pool(x, ksize=[1, size, size, 1], strides=[1, size, size, 1], padding="SAME")

    def _pooled_size(self, size, pool_layers=3):
        """Return the spatial extent left after ``pool_layers`` 2x2 poolings.

        With SAME padding and stride 2 every pooling yields ceil(size / 2).
        """
        for _ in range(pool_layers):
            size = (size + 1) // 2
        return size

    def inference(self, input_data):
        """Build the forward pass and return the output logits.

        Args:
            input_data: image tensor laid out as
                [batch, height, width, 1]; height and width are expected
                to match the values given to the constructor.

        Returns:
            Logits tensor of shape [batch, char_num * classes].
        """
        # The size comments below use the default 160x60 input.
        # Convolution layers
        with tf.name_scope("conv1"):
            # 5x5 kernel, 1 input channel, 32 output channels
            w_conv1 = self.weight_init([5, 5, 1, 32], "w_conv1")
            b_conv1 = self.bias_init([32], "b_conv1")
            # After convolution the image is still 160x60 (stride 1)
            h_conv1 = tf.nn.relu(self.conv2d(input_data, w_conv1) + b_conv1)
            # After pooling the image is 80x30 (160/2=80, 60/2=30)
            h_pool1 = self.max_pool(h_conv1, 2)
        with tf.name_scope("conv2"):
            # 5x5 kernel, 32 input channels, 64 output channels
            w_conv2 = self.weight_init([5, 5, 32, 64], "w_conv2")
            b_conv2 = self.bias_init([64], "b_conv2")
            # After convolution the image is still 80x30 (80/1=80, 30/1=30)
            h_conv2 = tf.nn.relu(self.conv2d(h_pool1, w_conv2) + b_conv2)
            # After pooling the image is 40x15 (80/2=40, 30/2=15)
            h_pool2 = self.max_pool(h_conv2, 2)
        with tf.name_scope("conv3"):
            # 5x5 kernel, 64 input channels, 64 output channels
            w_conv3 = self.weight_init([5, 5, 64, 64], "w_conv3")
            b_conv3 = self.bias_init([64], "b_conv3")
            # After convolution the image is still 40x15 (40/1=40, 15/1=15)
            h_conv3 = tf.nn.relu(self.conv2d(h_pool2, w_conv3) + b_conv3)
            # After pooling the image is 20x8 (40/2=20, ceil(15/2)=8)
            h_pool3 = self.max_pool(h_conv3, 2)
        # Flattened feature length, derived from the configured image size
        # instead of being hard-coded (20*8*64=10240 for the defaults).
        flat_dim = self._pooled_size(self.width) * self._pooled_size(self.height) * 64
        # Fully connected layers
        with tf.name_scope("fc1"):
            # Flatten the pooled features, then map flat_dim -> 1024
            w_fc1 = self.weight_init([flat_dim, 1024], "w_fc1")
            b_fc1 = self.bias_init([1024], "b_fc1")
            h_fc1 = tf.nn.relu(tf.matmul(tf.reshape(h_pool3, [-1, flat_dim]), w_fc1) + b_fc1)
        with tf.name_scope("fc2"):
            # Map 1024 -> char_num * classes (one score per character slot and class)
            w_fc2 = self.weight_init([1024, self.char_num * self.classes], "w_fc2")
            b_fc2 = self.bias_init([self.char_num * self.classes], "b_fc2")
            h_fc2 = tf.matmul(h_fc1, w_fc2) + b_fc2
        return h_fc2
代码每一步都做了详细的注释,在经过三层卷积和池化之后,输入数据由刚开始的[batch_size, 60, 160, 1]变成了[batch_size, 8, 20, 64]。卷积的计算过程就是提取特征的过程,我们不用关心到底提取了什么特征,或者特征的实际意义是什么,只需要知道最后得出来的这个8x20x64的矩阵可以表示特征的值。然后将[8, 20, 64]展开成一维向量,向量的长度为8x20x64=10240;再用两层的全连接层和relu激活函数将这10240维的特征进行映射,输出维度为验证码可能的种类数62×4=248。具体的卷积和池化是怎么进行的,可以自行搜索。
三、模型训练
模型训练常规的流程就是定义占位符(数据输入)、变量、损失函数等,然后启动Session执行图计算,程序每轮获取批次大小为64的数据进行训练,然后每100轮进行一次测试,当准确率大于95%时保存模型并退出训练。另外在代码中做了变量cross_entropy(损失值)和accuracy(准确率)记录,训练完成后可以在tensorboard中看到变化过程。
#-*-coding:utf-8-*-
import tensorflow as tf
import numpy as np
import string
import captcha_generate
import captcha_model
from datetime import datetime
def main():
    """Train the captcha model and save it once it is accurate enough.

    Every step trains on a freshly generated batch of 64 captchas. Every
    100 steps a batch of 100 captchas is evaluated; when the
    per-character accuracy on it exceeds 95% the model is saved to
    ``model/01/captcha_model.ckpt`` and training stops. Loss, accuracy
    and input images are written as summaries for TensorBoard.
    """
    captcha_gen = captcha_generate.captcha_tensorflow()
    width, height, char_num, characters, classes = captcha_gen.get_parameter()
    # Input placeholders
    img_x = tf.placeholder(tf.float32, [None, height, width, 1])
    img_y = tf.placeholder(tf.float32, [None, char_num*classes])
    # Build the model with the generator's parameters so both always agree
    captcha_mod = captcha_model.create_model(width=width, height=height,
                                             char_num=char_num, characters=characters)
    pre_y = captcha_mod.inference(img_x)
    # Loss function and optimizer
    cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=img_y, logits=pre_y))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    # Per-character accuracy: compare the argmax class of every character slot
    predict = tf.reshape(pre_y, [-1, char_num, classes])
    real = tf.reshape(img_y, [-1, char_num, classes])
    correct_prediction = tf.equal(tf.argmax(predict, 2), tf.argmax(real, 2))
    correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)
    # Summaries
    tf.summary.scalar("cross_entropy", cross_entropy)
    tf.summary.scalar("accuracy", accuracy)
    tf.summary.image("input", img_x)
    merged_summary = tf.summary.merge_all()
    # Run the session
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter("../../log/captcha/01", graph=sess.graph)
        try:
            step = 0
            while True:
                batch_x, batch_y = captcha_gen.get_batch_captcha(64)
                _, loss, summary = sess.run([train_step, cross_entropy, merged_summary],
                                            feed_dict={img_x: batch_x, img_y: batch_y})
                writer.add_summary(summary, step)
                if step % 100 == 0:
                    batch_x_test, batch_y_test = captcha_gen.get_batch_captcha(100)
                    loss, acc = sess.run([cross_entropy, accuracy],
                                         feed_dict={img_x: batch_x_test, img_y: batch_y_test})
                    print('step:%d, loss:%f, accuracy:%f' % (step, loss, acc))
                    # Stop only when the model is actually good (95%), not at 10%
                    if acc > 0.95:
                        saver.save(sess, "model/01/captcha_model.ckpt")
                        break
                step += 1
        finally:
            # Flush pending summaries and release the event file
            writer.close()
if __name__ == "__main__":
    # Print the start time, the end time and the total duration of the run.
    started_at = datetime.now()
    print(started_at)
    main()
    finished_at = datetime.now()
    print(finished_at)
    print(finished_at - started_at)
模型训练耗时将近四个小时。
四、模型检验
模型验证则是加载保存的模型然后进行验证码识别即可。
#-*-coding:utf-8-*-
from PIL import Image, ImageFilter
import tensorflow as tf
import numpy as np
import string
import sys
import captcha_generate
import captcha_model
def main():
    """Recognize the captcha image named by the first command-line argument.

    The image is converted to grayscale, fed through the network restored
    from ``model/01/captcha_model.ckpt`` (the path the training script
    saves to), and the predicted text is printed.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s <captcha image>' % sys.argv[0])
    # Parameters shared with the generator
    captcha_gen = captcha_generate.captcha_tensorflow()
    width, height, char_num, characters, classes = captcha_gen.get_parameter()
    # Load the image as grayscale; it must already be width x height pixels
    gray_image = Image.open(sys.argv[1]).convert('L')
    img = np.array(gray_image.getdata())
    test_x = np.reshape(img, [height, width, 1])/255.0
    # Input placeholder
    img_x = tf.placeholder(tf.float32, [None, height, width, 1])
    # Prediction graph: argmax class for every character slot
    captcha_mod = captcha_model.create_model(width=width, height=height,
                                             char_num=char_num, characters=characters)
    pre_y = captcha_mod.inference(img_x)
    predict = tf.argmax(tf.reshape(pre_y, [-1, char_num, classes]), 2)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Restoring assigns every saved variable, so no separate
        # initializer run is needed before it.
        saver.restore(sess, "model/01/captcha_model.ckpt")
        pre_list = sess.run(predict, feed_dict={img_x: [test_x]})
        for indices in pre_list:
            print(''.join(characters[j] for j in indices))
if __name__ == "__main__":
    # Run the recognition only when executed as a script.
    main()
然后生成如下的验证码图片进行识别测试
五、Tensorboard
Tensorboard是tensorflow自带的可视化工具,它可以对tensorflow生成的图、中间变量、图片等进行可视化展示。如下图所示是验证码识别模型的图结构。
因为我们在代码中记录了损失值和准确率的值,因此我们还可以看到他们的变化过程。横坐标是训练次数,纵坐标是值。