目标是构建一个基于向量的问答检索系统,即:问题和答案是固定的数据集,通过将用户输入的问题编码为向量,在向量数据库中匹配最相似的问题,并返回问题对应的答案。
实现时,希望能够对输入的问题进行类别判别。例如:我们做法律的问答检索系统,就希望能够判断出用户问题是否和法律相关,如果相关我们再进行后续处理。
训练这样的一个二分类模型使用到的数据为:
- 正样本:就是拿到的正常的问题的语料
- 负样本:来源就是开放的百度问答数据、以及部分其他行业的问答数据,从中筛选出不同的问题
这种二分类问题,在做的时候方案太多了,这里就使用了三种方法:
- 朴素贝叶斯:根据词的频数向量
- 支持向量机:用预训练词向量(下文代码中使用的是 fastText 的 cc.zh.300.bin)将问题编码为句向量
- Albert:mini 版本的 bert 预训练模型
从效果来看,朴素贝叶斯使用的是高维稀疏向量,虽然特征维度高,速度仍然可以接受,但其效果不如 albert 和 svm。albert 虽然是 mini 版本,模型也比另外两个大,并且训练过程会产生很多 checkpoint,需要我们自己来选择用哪个。svm + word2vec(下文代码中实际使用的是 fastText 预训练词向量)的方式,效果非常不错,训练时间也不长,模型也不大,我比较喜欢这个模型。
## 基于 Albert 的方法 {#title-0}
预训练模型:https://huggingface.co/clue/albert_chinese_tiny
下面是完整的实现代码:
from transformers import AlbertForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from tqdm import tqdm
from datasets import load_from_disk
import torch.optim as optim
import torch.nn as nn
import glob
import torch
import numpy as np
from datasets import Dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
# Compute device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def collate_function(batch_data, tokenizer):
    """Convert a list of dataset rows into tensors for the classifier.

    Args:
        batch_data: Iterable of rows; each row is indexed with ``'title'``
            and ``'label'``.
        tokenizer: Object providing ``batch_encode_plus``.

    Returns:
        A ``(title_tensor, label_tensor)`` pair. ``title_tensor`` is a dict
        of the tokenizer's output tensors and ``label_tensor`` holds the
        labels; both are placed on the module-level ``device``.
    """
    # Walk the batch once, then split the pairs into two parallel lists.
    pairs = [(row['title'], row['label']) for row in batch_data]
    titles = [title for title, _ in pairs]
    labels = [label for _, label in pairs]

    # Pad every title to the longest one in this batch.
    encoded = tokenizer.batch_encode_plus(
        titles,
        add_special_tokens=True,
        padding='longest',
        return_tensors='pt',
    )

    title_tensor = {name: tensor.to(device) for name, tensor in encoded.items()}
    label_tensor = torch.tensor(labels, device=device)
    return title_tensor, label_tensor
def train_albert():
    """Fine-tune ALBERT as a two-class intent classifier and save checkpoints.

    Loads the model and tokenizer from ``pretrained/albert_chinese_tiny``
    (https://huggingface.co/clue/albert_chinese_tiny) and the ``train`` split
    of ``data/intention.data``, then trains for 30 epochs with Adam
    (lr=1e-5, batch size 128). ``ReduceLROnPlateau`` is stepped once per
    epoch with the summed training loss. From the 7th epoch onwards the model
    and tokenizer are saved under ``finish/intention/albert/``; the directory
    name carries the epoch number and that epoch's summed loss.
    """
    estimator = AlbertForSequenceClassification.from_pretrained(
        'pretrained/albert_chinese_tiny', num_labels=2).to(device)
    tokenizer = BertTokenizer.from_pretrained('pretrained/albert_chinese_tiny')
    traindata = load_from_disk('data/intention.data')['train']
    dataloader = DataLoader(
        traindata,
        batch_size=128,
        shuffle=True,
        collate_fn=lambda data: collate_function(data, tokenizer),
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(estimator.parameters(), lr=1e-5)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.6, patience=2, cooldown=2, verbose=True)

    for epoch in range(30):
        # from_pretrained() hands the model back in evaluation mode, so
        # training mode has to be enabled explicitly or dropout stays off.
        estimator.train()

        total_loss, total_size, total_corr = 0.0, 0, 0
        progress = tqdm(range(len(dataloader)))
        for title_tensor, label_tensor in dataloader:
            outputs = estimator(**title_tensor)
            loss = criterion(outputs.logits, label_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Predicted labels for the running training accuracy.
            y_pred = torch.argmax(outputs.logits, dim=-1)
            total_corr += (y_pred == label_tensor).sum().item()
            # loss is the batch mean; weight it by batch size to get a sum.
            total_loss += loss.item() * len(label_tensor)
            total_size += len(label_tensor)

            # Progress line: epoch, summed loss, correct/seen, accuracy, lr.
            desc = '%2d. %6.1f %5d/%5d %.4f %.2E' % (
                epoch + 1, total_loss, total_corr, total_size,
                total_corr / total_size, scheduler.optimizer.param_groups[0]['lr'])
            progress.set_description(desc)
            progress.update()

        scheduler.step(total_loss)
        progress.close()

        # Skip the first six epochs; keep a checkpoint for every later one.
        if epoch > 5:
            model_save_path = 'finish/intention/albert/%0d_intention_albert_loss_%.4f' % (epoch + 1, total_loss)
            estimator.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)
@torch.no_grad()
def eval_model(model_name):
    """Evaluate one saved checkpoint on the test split and print its metrics.

    Args:
        model_name: Path of a checkpoint directory that holds both the model
            and the tokenizer. The part after the last ``'/'`` is used as the
            progress-bar label.

    Prints accuracy followed by the precision, recall and F-score values
    returned by ``precision_recall_fscore_support``, then a separator line.
    """
    classifier = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    classifier = classifier.to(device).eval()
    tokenizer = BertTokenizer.from_pretrained(model_name)
    test_split = load_from_disk('data/intention.data')['test']

    def collate(rows):
        return collate_function(rows, tokenizer)

    loader = DataLoader(test_split, batch_size=128, shuffle=True, collate_fn=collate)

    # Only the last path component is shown next to the progress bar.
    short_name = model_name.rsplit('/', 1)[-1]
    progress = tqdm(range(len(loader)), desc='%30s' % short_name)

    expected, predicted = [], []
    for batch_inputs, batch_labels in loader:
        logits = classifier(**batch_inputs).logits
        batch_pred = torch.argmax(logits, dim=-1)
        predicted.extend(batch_pred.cpu().numpy().tolist())
        expected.extend(batch_labels.cpu().numpy().tolist())
        progress.update()
    progress.close()

    print('准确率:', accuracy_score(expected, predicted))
    precision, recall, f_score, _ = precision_recall_fscore_support(expected, predicted)
    print('精确率:', precision)
    print('召回率:', recall)
    print('F-score:', f_score)
    print('-' * 100)
def eval_albert():
    """Evaluate every saved ALBERT checkpoint in ascending epoch order.

    Checkpoint directories are collected with
    ``finish/intention/albert/*intention_albert*``. ``glob.glob`` returns
    matches in arbitrary order, so they are sorted by the integer epoch that
    prefixes the directory name (``<epoch>_intention_albert_loss_<loss>``)
    to make the printed report reproducible. Names without a numeric prefix
    are evaluated last, in alphabetical order.
    """
    import os  # local import: `os` is not among the file-level imports

    def epoch_key(path):
        prefix = os.path.basename(path).split('_', 1)[0]
        try:
            return (0, int(prefix), path)
        except ValueError:
            # No parsable epoch number: sort after all numbered checkpoints.
            return (1, 0, path)

    model_names = glob.glob('finish/intention/albert/*intention_albert*')
    for model_name in sorted(model_names, key=epoch_key):
        eval_model(model_name)
def predict(inputs, model_save_path='albert_model/epoch_36_simcse_loss_4.5393'):
    """Classify a single question and print the verdict.

    Args:
        inputs: The question text to classify.
        model_save_path: Directory holding the saved model and tokenizer.
            The default is the path that used to be hard-coded here.
            NOTE(review): that default does not follow the
            ``finish/intention/albert/...`` pattern written by
            ``train_albert`` — confirm which checkpoint is intended.

    Returns:
        The predicted class index as an ``int``. A message is also printed:
        in red for class 1, plain for any other class.
    """
    tokenizer = BertTokenizer.from_pretrained(model_save_path)
    estimator = AlbertForSequenceClassification.from_pretrained(model_save_path, num_labels=2).eval()

    # Encode the input; only the input ids are passed to the model.
    encoded = tokenizer.encode_plus(inputs,
                                    return_token_type_ids=False,
                                    return_attention_mask=False,
                                    return_tensors='pt')

    # Model prediction.
    with torch.no_grad():
        outputs = estimator(**encoded)

    # Arg-max over the class axis; the batch holds exactly one sample.
    label = torch.argmax(outputs.logits, dim=-1).item()

    if label == 1:
        print('\033[31m需要处理的问题\033[m')
    else:
        print('其他方面的问题')
    return label
def test():
    """Run ``predict`` on a fixed list of sample sentences, in order."""
    samples = (
        '为什么抽完血后会出现头晕、四肢无力脸色发白冒汗等现象?',
        '我是怀孕了吗',
        '嗓子起疮,这是什么原因导致的?',
        '哈哈',
        '我滴妈呀,你真笨啊',
        '前天早上在医院的广场上玩篮球,一会来了几个病人,我们就一起玩了',
        '我是医院的病人,我发烧了,所以在这里住院',
    )
    for sentence in samples:
        predict(sentence)
if __name__ == '__main__':
    # Training is disabled; uncomment the next line to run it.
    # train_albert()
    # Active entry point: evaluate the saved checkpoints.
    eval_albert()
    # Uncomment to run the sample predictions instead.
    # test()
各个模型的评估结果为:
7_intention_albert_loss_308.1043: 100%|█████████| 17/17 [00:02<00:00, 8.49it/s]
准确率: 0.9884504331087585
精确率: [0.98823529 0.98869476]
召回率: [0.9900272 0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
8_intention_albert_loss_231.9439: 100%|█████████| 17/17 [00:01<00:00, 10.74it/s]
准确率: 0.9884504331087585
精确率: [0.99090082 0.98569969]
召回率: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
9_intention_albert_loss_186.2181: 100%|█████████| 17/17 [00:01<00:00, 10.97it/s]
准确率: 0.9908565928777671
精确率: [0.996337 0.98478702]
召回率: [0.98640073 0.99589744]
F-score: [0.99134396 0.99031107]
----------------------------------------------------------------------------------------------------
10_intention_albert_loss_155.6420: 100%|████████| 17/17 [00:01<00:00, 10.81it/s]
准确率: 0.9846005774783445
精确率: [0.98026906 0.98961578]
召回率: [0.99093382 0.9774359 ]
F-score: [0.98557259 0.98348813]
----------------------------------------------------------------------------------------------------
11_intention_albert_loss_133.5007: 100%|████████| 17/17 [00:01<00:00, 10.65it/s]
准确率: 0.9898941289701636
精确率: [0.99271403 0.98673469]
召回率: [0.98821396 0.99179487]
F-score: [0.99045888 0.98925831]
----------------------------------------------------------------------------------------------------
12_intention_albert_loss_92.2596: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
准确率: 0.987487969201155
精确率: [0.98910082 0.98567042]
召回率: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
13_intention_albert_loss_80.6847: 100%|█████████| 17/17 [00:01<00:00, 10.55it/s]
准确率: 0.987487969201155
精确率: [0.98910082 0.98567042]
召回率: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
14_intention_albert_loss_67.7850: 100%|█████████| 17/17 [00:01<00:00, 10.59it/s]
准确率: 0.9879692011549567
精确率: [0.98646209 0.98969072]
召回率: [0.99093382 0.98461538]
F-score: [0.9886929 0.98714653]
----------------------------------------------------------------------------------------------------
15_intention_albert_loss_61.4615: 100%|█████████| 17/17 [00:01<00:00, 10.62it/s]
准确率: 0.9879692011549567
精确率: [0.99 0.98568507]
召回率: [0.98730734 0.98871795]
F-score: [0.98865184 0.98719918]
----------------------------------------------------------------------------------------------------
16_intention_albert_loss_39.3099: 100%|█████████| 17/17 [00:01<00:00, 10.44it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
17_intention_albert_loss_28.6968: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
准确率: 0.9884504331087585
精确率: [0.99000908 0.98669396]
召回率: [0.98821396 0.98871795]
F-score: [0.98911071 0.98770492]
----------------------------------------------------------------------------------------------------
18_intention_albert_loss_24.5714: 100%|█████████| 17/17 [00:01<00:00, 10.63it/s]
准确率: 0.9884504331087585
精确率: [0.99090082 0.98569969]
召回率: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
19_intention_albert_loss_21.6295: 100%|█████████| 17/17 [00:01<00:00, 10.49it/s]
准确率: 0.9884504331087585
精确率: [0.99090082 0.98569969]
召回率: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
20_intention_albert_loss_19.8042: 100%|█████████| 17/17 [00:01<00:00, 10.60it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
21_intention_albert_loss_18.0605: 100%|█████████| 17/17 [00:01<00:00, 10.35it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
22_intention_albert_loss_16.5603: 100%|█████████| 17/17 [00:01<00:00, 10.61it/s]
准确率: 0.9884504331087585
精确率: [0.98823529 0.98869476]
召回率: [0.9900272 0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
23_intention_albert_loss_15.6354: 100%|█████████| 17/17 [00:01<00:00, 10.46it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
24_intention_albert_loss_14.7335: 100%|█████████| 17/17 [00:01<00:00, 10.76it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
25_intention_albert_loss_13.9919: 100%|█████████| 17/17 [00:01<00:00, 10.48it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
26_intention_albert_loss_13.3679: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
准确率: 0.9894128970163619
精确率: [0.99091735 0.9877175 ]
召回率: [0.98912058 0.98974359]
F-score: [0.99001815 0.98872951]
----------------------------------------------------------------------------------------------------
27_intention_albert_loss_12.8064: 100%|█████████| 17/17 [00:01<00:00, 10.68it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
28_intention_albert_loss_12.2769: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
29_intention_albert_loss_11.7247: 100%|█████████| 17/17 [00:01<00:00, 10.69it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
30_intention_albert_loss_11.1622: 100%|█████████| 17/17 [00:01<00:00, 10.39it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
## 基于朴素贝叶斯 {#title-1}
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from datasets import load_from_disk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import pickle
import jieba.posseg as psg
import jieba
# Set jieba's log level to 0 (numerically logging.NOTSET) — presumably to quiet its start-up messages.
jieba.setLogLevel(0)
# Part-of-speech tags that are kept (the n*, v* and a* tags listed, plus 's').
_ALLOWED_POS = frozenset([
    'n', 'nr', 'ns', 'nt', 'nl', 'nz', 'nsf', 's',
    'v', 'vd', 'vn', 'vx',
    'a', 'ad', 'al', 'an',
])

# Stop-word sets keyed by file path, filled on first use.
_STOPWORDS_CACHE = {}


def _load_stopwords(path='file/stopwords.txt'):
    """Return the stop words in *path* as a frozenset, reading the file once."""
    if path not in _STOPWORDS_CACHE:
        # Explicit encoding so the result does not depend on the platform
        # default. NOTE(review): assumes the file is UTF-8 — confirm.
        with open(path, encoding='utf-8') as handle:
            _STOPWORDS_CACHE[path] = frozenset(line.strip() for line in handle)
    return _STOPWORDS_CACHE[path]


def cut_word(sentence):
    """Segment *sentence* and keep only the informative words.

    The sentence is split with ``jieba.posseg``. A word is kept when its
    part-of-speech tag is in the allow-list and the word is not listed in
    ``file/stopwords.txt``.

    Args:
        sentence: The text to segment.

    Returns:
        The kept words joined by single spaces (an empty string when no
        word survives the filters).
    """
    # Loaded once and cached; this function runs for every sentence.
    stopwords = _load_stopwords()
    kept = [
        word
        for word, pos in psg.lcut(sentence)
        if pos in _ALLOWED_POS and word not in stopwords
    ]
    return ' '.join(kept)
def train_vectorizer():
    """Fit the bag-of-words vectorizer on the training titles and save it.

    Each title in the ``train`` split of ``data/intention.data`` is segmented
    with ``cut_word``. A ``CountVectorizer`` capped at 21246 features is
    fitted on the result, the feature count is printed, and the fitted
    vectorizer is pickled to ``finish/intention/bayes/vectorizer.pkl``.
    """
    questions = load_from_disk('data/intention.data')['train']
    corpus = [cut_word(question) for question in questions['title']]

    tokenizer = CountVectorizer(max_features=21246)
    tokenizer.fit(corpus)
    print('特征数:', len(tokenizer.get_feature_names_out()))

    # Context manager so the file is flushed and closed deterministically.
    with open('finish/intention/bayes/vectorizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle)
def train_bayes_model():
    """Train a multinomial naive Bayes classifier and save it.

    Loads the vectorizer from ``finish/intention/bayes/vectorizer.pkl``,
    segments the ``train`` titles of ``data/intention.data`` with
    ``cut_word``, transforms them into count vectors, fits ``MultinomialNB``
    on them and pickles the model to ``finish/intention/bayes/bayes.pkl``.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open('finish/intention/bayes/vectorizer.pkl', 'rb') as handle:
        vectorizer = pickle.load(handle)

    questions = load_from_disk('data/intention.data')['train']
    inputs = [cut_word(title) for title in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)

    estimator = MultinomialNB()
    estimator.fit(inputs, labels)

    # Context manager so the file is flushed and closed deterministically.
    with open('finish/intention/bayes/bayes.pkl', 'wb') as handle:
        pickle.dump(estimator, handle)
def eval_bayes_model():
    """Evaluate the saved naive Bayes classifier on the test split.

    Loads the vectorizer and model from ``finish/intention/bayes/``, segments
    and vectorizes the ``test`` titles of ``data/intention.data``, then prints
    accuracy plus the precision, recall and F-score values returned by
    ``precision_recall_fscore_support``.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open('finish/intention/bayes/vectorizer.pkl', 'rb') as handle:
        vectorizer = pickle.load(handle)
    with open('finish/intention/bayes/bayes.pkl', 'rb') as handle:
        estimator = pickle.load(handle)

    questions = load_from_disk('data/intention.data')['test']
    inputs = [cut_word(question) for question in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)
    ypreds = estimator.predict(inputs)

    precision, recall, f_score, _ = precision_recall_fscore_support(labels, ypreds)
    print('准确率:', accuracy_score(labels, ypreds))
    print('精确率:', precision)
    print('召回率:', recall)
    print('F-score:', f_score)
if __name__ == '__main__':
    # Full pipeline: fit the vectorizer, train the classifier, then evaluate it.
    # The later steps read the pickle files written by the earlier ones.
    train_vectorizer()
    train_bayes_model()
    eval_bayes_model()
评估的结果为:
准确率: 0.9701636188642926
精确率: [0.99430199 0.94536585]
召回率: [0.94922937 0.99384615]
F-score: [0.97124304 0.969 ]
## 基于支持向量机 {#title-2}
import pickle
from sklearn.svm import SVC
from datasets import load_from_disk
import jieba
# Set jieba's log level to 0 (numerically logging.NOTSET) — presumably to quiet its start-up messages.
jieba.setLogLevel(0)
import fasttext
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import datasets
# Turn off the progress bars of the `datasets` library.
# NOTE(review): newer `datasets` releases appear to name this `disable_progress_bars` — confirm against the installed version.
datasets.disable_progress_bar()
def train_svm():
    """Train an SVM on fastText sentence vectors and save it.

    Each title in the ``train`` split of ``data/intention.data`` is segmented
    with jieba, joined with spaces and embedded with the fastText model
    ``pretrained/cc.zh.300.bin``. An ``SVC`` with default settings is fitted
    on the vectors and pickled to ``finish/intention/svm/svm.pkl``.
    """
    traindata = load_from_disk('data/intention.data')['train']
    tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')

    def collate_function(batch_data):
        # Replace each title by its sentence vector; labels pass through.
        vectors = [
            tokenizer.get_sentence_vector(' '.join(jieba.lcut(title))).tolist()
            for title in batch_data['title']
        ]
        return {'title': vectors, 'label': batch_data['label']}

    # Vectorize the data.
    traindata = traindata.map(collate_function, batched=True, batch_size=32)

    # Train the support vector machine.
    estimator = SVC()
    estimator.fit(traindata['title'], traindata['label'])

    # Store the model; the context manager closes the file deterministically.
    with open('finish/intention/svm/svm.pkl', 'wb') as handle:
        pickle.dump(estimator, handle)
def eval_svm():
    """Evaluate the saved SVM on both the training and the test split.

    Loads the model from ``finish/intention/svm/svm.pkl``, embeds every title
    of ``data/intention.data`` with the fastText model
    ``pretrained/cc.zh.300.bin``, and prints accuracy, precision, recall and
    F-score for the ``train`` split and then for the ``test`` split, with a
    separator line in between.

    The training-set accuracy line is headed ``训练集:`` so that it mirrors
    the ``测试集:`` header used for the test split.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open('finish/intention/svm/svm.pkl', 'rb') as handle:
        estimator = pickle.load(handle)
    tokenizer = fasttext.load_model('pretrained/cc.zh.300.bin')
    dataset = load_from_disk('data/intention.data')

    def collate_function(batch_data):
        # Replace each title by its sentence vector; labels pass through.
        vectors = [
            tokenizer.get_sentence_vector(' '.join(jieba.lcut(title))).tolist()
            for title in batch_data['title']
        ]
        return {'title': vectors, 'label': batch_data['label']}

    def report(header, split):
        # Print the metrics of one split; `header` labels the accuracy line.
        y_pred = estimator.predict(split['title'])
        y_true = split['label']
        print(header, accuracy_score(y_true, y_pred))
        precision, recall, f_score, _ = precision_recall_fscore_support(y_true, y_pred)
        print('精确率:', precision)
        print('召回率:', recall)
        print('F-score:', f_score)

    dataset = dataset.map(collate_function, batched=True, batch_size=32)

    # Training-set metrics.
    report('训练集:', dataset['train'])
    print('-' * 50)
    # Test-set metrics.
    report('测试集:', dataset['test'])
if __name__ == '__main__':
    # Train the SVM first; eval_svm() then loads the model file it wrote.
    train_svm()
    eval_svm()
评估结果:
训练集: 0.9879503698401336
(array([0.99103551, 0.98462111]), array([0.98582371, 0.99027067]), array([0.98842274, 0.98743781]), array([8747, 8017]))
测试集: 0.9880725190839694
(array([0.98912551, 0.98690176]), array([0.98822997, 0.98789713]), array([0.98867754, 0.98739919]), array([2209, 1983]))