51工具盒子

依楼听风雨
笑看云卷云舒,淡观潮起潮落

基于 svm、朴素贝叶斯、albert 文本分类

目标是构建一个基于向量的问答检索系统,即:问题和答案是固定的数据集,通过将用户输入的问题编码为向量,在向量数据库中匹配最相似的问题,并返回问题对应的答案。

实现时,希望能够对输入的问题进行类别判别。例如:我们做法律的问答检索系统,就希望能够判断出用户问题是否和法律相关,如果相关我们再进行后续处理。

训练这样的一个二分类模型使用到的数据为:

  1. 正样本:就是拿到的正常的问题的语料
  2. 负样本:来源就是开放的百度问答数据、以及部分其他行业的问答数据,从中筛选出不同的问题

这种二分类问题,在做的时候方案太多了,这里就使用了三种方法:

  1. 朴素贝叶斯:根据词的频数向量
  2. 支持向量机:用 fastText 预训练词向量(cc.zh.300.bin)将问题编码为句向量
  3. Albert:mini 版本的 bert 预训练模型

从效果来看,朴素贝叶斯使用的是高维稀疏向量,虽然特征维度高,但速度还可以接受,只是效果不如 albert 和 svm。albert 虽然是 mini 版本,模型也比另外两个大,并且训练过程会产生很多 checkpoint,需要我们自己挑选使用哪一个。svm + 预训练词向量(代码中使用的是 fastText)的方式效果非常不错,训练时间不长,模型也不大,我比较喜欢这个模型。

# 1. 基于 Albert 的方法 {#title-0}

预训练模型:https://huggingface.co/clue/albert_chinese_tiny

下面完整的实现代码:

from transformers import AlbertForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from tqdm import tqdm
from datasets import load_from_disk
import torch.optim as optim
import torch.nn as nn
import glob
import torch
import numpy as np
from datasets import Dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# Compute device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def collate_function(batch_data, tokenizer):
    """Turn a list of samples into model inputs and a label tensor.

    Each sample is indexed with the keys 'title' and 'label'.  The titles are
    encoded together by ``tokenizer`` (padded to the longest title in the
    batch) and every resulting tensor, as well as the labels, is placed on the
    module-level ``device``.

    Returns a ``(inputs, labels)`` pair: a dict of tensors and a label tensor.
    """
    # Walk the batch once and split it into two parallel lists.
    pairs = [(sample['title'], sample['label']) for sample in batch_data]
    texts = [text for text, _ in pairs]
    targets = [target for _, target in pairs]

    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        padding='longest',
        return_tensors='pt',
    )

    inputs = {}
    for name, tensor in encoded.items():
        inputs[name] = tensor.to(device)

    return inputs, torch.tensor(targets, device=device)

def train_albert():
    """Fine-tune the ALBERT-tiny checkpoint as a two-class question classifier.

    Loads the model and tokenizer from 'pretrained/albert_chinese_tiny' and the
    'train' split of 'data/intention.data', then trains for 30 epochs with Adam
    (lr=1e-5) and cross-entropy loss in shuffled batches of 128.  The learning
    rate is lowered by ReduceLROnPlateau, driven by the summed loss of each
    epoch.  After every epoch whose zero-based index is greater than 5 (i.e.
    from the 7th epoch on) the model and tokenizer are saved under
    'finish/intention/albert/'.
    """
    # https://huggingface.co/clue/albert_chinese_tiny
    estimator = AlbertForSequenceClassification.from_pretrained('pretrained/albert_chinese_tiny', num_labels=2).to(device)
    # from_pretrained() returns the model in evaluation mode; switch to
    # training mode explicitly so dropout is active while fine-tuning.
    estimator.train()
    tokenizer = BertTokenizer.from_pretrained('pretrained/albert_chinese_tiny')
    traindata = load_from_disk('data/intention.data')['train']
    dataloader = DataLoader(traindata, batch_size=128, shuffle=True, collate_fn=lambda data: collate_function(data, tokenizer))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(estimator.parameters(), lr=1e-5)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.6, patience=2, cooldown=2, verbose=True)

    for epoch in range(30):

        # Per-epoch running totals: summed sample loss, samples seen, correct predictions.
        total_loss, total_size, total_corr = 0.0, 0, 0
        progress = tqdm(range(len(dataloader)))
        for title_tensor, label_tensor in dataloader:

            outputs = estimator(**title_tensor)
            loss = criterion(outputs.logits, label_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Predicted labels for this batch.
            y_pred = torch.argmax(outputs.logits, dim=-1)
            total_corr += (y_pred == label_tensor).sum().item()
            # The criterion returns the batch mean, so scale it back to a per-sample sum.
            total_loss += loss.item() * len(label_tensor)
            total_size += len(label_tensor)

            # Refresh the progress bar: epoch, loss, correct/seen, accuracy, learning rate.
            desc = '%2d. %6.1f %5d/%5d %.4f %.2E' % (epoch + 1, total_loss, total_corr, total_size, total_corr/total_size, scheduler.optimizer.param_groups[0]['lr'])
            progress.set_description(desc)
            progress.update()

        scheduler.step(total_loss)
        progress.close()

        # Skip the first six epochs; afterwards keep one checkpoint per epoch.
        if epoch > 5:
            model_save_path = 'finish/intention/albert/%0d_intention_albert_loss_%.4f' % (epoch + 1, total_loss)
            estimator.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)

@torch.no_grad()
def eval_model(model_name):
    """Evaluate one saved ALBERT checkpoint on the test split and print metrics.

    ``model_name`` is the checkpoint directory passed to ``from_pretrained``.
    The 'test' split of 'data/intention.data' is run through the model in
    batches of 128, then accuracy and per-class precision, recall and F-score
    are printed, followed by a separator line.
    """
    import os  # local import: used only to derive the display name below

    estimator = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device).eval()
    tokenizer = BertTokenizer.from_pretrained(model_name)
    testdata = load_from_disk('data/intention.data')['test']
    # Sample order does not influence the metrics, so no shuffling is needed.
    dataloader = DataLoader(testdata, batch_size=128, shuffle=False, collate_fn=lambda data: collate_function(data, tokenizer))

    # Show only the last path component; basename copes with the platform's separator.
    display_name = os.path.basename(model_name)
    progress = tqdm(range(len(dataloader)), desc='%30s' % display_name)
    y_true, y_pred = [], []
    for inputs_tensor, labels_tensor in dataloader:
        outputs = estimator(**inputs_tensor)
        y_label = torch.argmax(outputs.logits, dim=-1)
        y_pred.extend(y_label.tolist())
        y_true.extend(labels_tensor.tolist())
        progress.update()
    progress.close()

    print('准确率:', accuracy_score(y_true, y_pred))
    precision, recall, f_score, _ = precision_recall_fscore_support(y_true, y_pred)
    print('精确率:', precision)
    print('召回率:', recall)
    print('F-score:', f_score)
    print('-' * 100)

def eval_albert():
    """Run eval_model() on every checkpoint matching the ALBERT save pattern."""
    checkpoint_pattern = 'finish/intention/albert/*intention_albert*'
    for checkpoint_dir in glob.glob(checkpoint_pattern):
        eval_model(checkpoint_dir)

def predict(inputs, model_save_path='albert_model/epoch_36_simcse_loss_4.5393'):
    """Classify a single question string and print the verdict.

    Args:
        inputs: the question text to classify.
        model_save_path: checkpoint directory holding both the tokenizer and
            the classifier.  The default is the path the original code
            hard-coded.
            NOTE(review): that default is not under 'finish/intention/albert/',
            where train_albert() saves its checkpoints -- confirm which
            checkpoint is intended.

    Prints a red "needs handling" message when the predicted class is 1 and an
    "other topic" message otherwise.  The tokenizer and model are loaded on
    every call.
    """
    tokenizer = BertTokenizer.from_pretrained(model_save_path)
    estimator = AlbertForSequenceClassification.from_pretrained(model_save_path, num_labels=2).eval()

    # Encode the input text; only the token ids are requested.
    encoded = tokenizer.encode_plus(inputs,
                                    return_token_type_ids=False,
                                    return_attention_mask=False,
                                    return_tensors='pt')
    # Run the model without tracking gradients.
    with torch.no_grad():
        outputs = estimator(**encoded)
        # Take the argmax over the class axis explicitly rather than over the flattened logits.
        y_pred = torch.argmax(outputs.logits, dim=-1)

    if y_pred.item() == 1:
        print('\033[31m需要处理的问题\033[m')
    else:
        print('其他方面的问题')

def test():
    """Run predict() once for each of the built-in sample sentences, in order."""
    samples = (
        '为什么抽完血后会出现头晕、四肢无力脸色发白冒汗等现象?',
        '我是怀孕了吗',
        '嗓子起疮,这是什么原因导致的?',
        '哈哈',
        '我滴妈呀,你真笨啊',
        '前天早上在医院的广场上玩篮球,一会来了几个病人,我们就一起玩了',
        '我是医院的病人,我发烧了,所以在这里住院',
    )
    for sample in samples:
        predict(sample)

if __name__ == '__main__':
    # Only evaluation runs by default; uncomment train_albert() to retrain
    # or test() to run the sample predictions.
    # train_albert()
    eval_albert()
    # test()

各个模型的评估结果为:

7_intention_albert_loss_308.1043: 100%|█████████| 17/17 [00:02<00:00,  8.49it/s]
准确率: 0.9884504331087585
精确率: [0.98823529 0.98869476]
召回率: [0.9900272  0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
8_intention_albert_loss_231.9439: 100%|█████████| 17/17 [00:01<00:00, 10.74it/s]
准确率: 0.9884504331087585
精确率: [0.99090082 0.98569969]
召回率: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
9_intention_albert_loss_186.2181: 100%|█████████| 17/17 [00:01<00:00, 10.97it/s]
准确率: 0.9908565928777671
精确率: [0.996337   0.98478702]
召回率: [0.98640073 0.99589744]
F-score: [0.99134396 0.99031107]
----------------------------------------------------------------------------------------------------
10_intention_albert_loss_155.6420: 100%|████████| 17/17 [00:01<00:00, 10.81it/s]
准确率: 0.9846005774783445
精确率: [0.98026906 0.98961578]
召回率: [0.99093382 0.9774359 ]
F-score: [0.98557259 0.98348813]
----------------------------------------------------------------------------------------------------
11_intention_albert_loss_133.5007: 100%|████████| 17/17 [00:01<00:00, 10.65it/s]
准确率: 0.9898941289701636
精确率: [0.99271403 0.98673469]
召回率: [0.98821396 0.99179487]
F-score: [0.99045888 0.98925831]
----------------------------------------------------------------------------------------------------
12_intention_albert_loss_92.2596: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
准确率: 0.987487969201155
精确率: [0.98910082 0.98567042]
召回率: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
13_intention_albert_loss_80.6847: 100%|█████████| 17/17 [00:01<00:00, 10.55it/s]
准确率: 0.987487969201155
精确率: [0.98910082 0.98567042]
召回率: [0.98730734 0.98769231]
F-score: [0.98820327 0.98668033]
----------------------------------------------------------------------------------------------------
14_intention_albert_loss_67.7850: 100%|█████████| 17/17 [00:01<00:00, 10.59it/s]
准确率: 0.9879692011549567
精确率: [0.98646209 0.98969072]
召回率: [0.99093382 0.98461538]
F-score: [0.9886929  0.98714653]
----------------------------------------------------------------------------------------------------
15_intention_albert_loss_61.4615: 100%|█████████| 17/17 [00:01<00:00, 10.62it/s]
准确率: 0.9879692011549567
精确率: [0.99       0.98568507]
召回率: [0.98730734 0.98871795]
F-score: [0.98865184 0.98719918]
----------------------------------------------------------------------------------------------------
16_intention_albert_loss_39.3099: 100%|█████████| 17/17 [00:01<00:00, 10.44it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
17_intention_albert_loss_28.6968: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
准确率: 0.9884504331087585
精确率: [0.99000908 0.98669396]
召回率: [0.98821396 0.98871795]
F-score: [0.98911071 0.98770492]
----------------------------------------------------------------------------------------------------
18_intention_albert_loss_24.5714: 100%|█████████| 17/17 [00:01<00:00, 10.63it/s]
准确率: 0.9884504331087585
精确率: [0.99090082 0.98569969]
召回率: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
19_intention_albert_loss_21.6295: 100%|█████████| 17/17 [00:01<00:00, 10.49it/s]
准确率: 0.9884504331087585
精确率: [0.99090082 0.98569969]
召回率: [0.98730734 0.98974359]
F-score: [0.98910082 0.9877175 ]
----------------------------------------------------------------------------------------------------
20_intention_albert_loss_19.8042: 100%|█████████| 17/17 [00:01<00:00, 10.60it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
21_intention_albert_loss_18.0605: 100%|█████████| 17/17 [00:01<00:00, 10.35it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
22_intention_albert_loss_16.5603: 100%|█████████| 17/17 [00:01<00:00, 10.61it/s]
准确率: 0.9884504331087585
精确率: [0.98823529 0.98869476]
召回率: [0.9900272  0.98666667]
F-score: [0.98913043 0.98767967]
----------------------------------------------------------------------------------------------------
23_intention_albert_loss_15.6354: 100%|█████████| 17/17 [00:01<00:00, 10.46it/s]
准确率: 0.9889316650625601
精确率: [0.99090909 0.98670757]
召回率: [0.98821396 0.98974359]
F-score: [0.98955969 0.98822325]
----------------------------------------------------------------------------------------------------
24_intention_albert_loss_14.7335: 100%|█████████| 17/17 [00:01<00:00, 10.76it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
25_intention_albert_loss_13.9919: 100%|█████████| 17/17 [00:01<00:00, 10.48it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
26_intention_albert_loss_13.3679: 100%|█████████| 17/17 [00:01<00:00, 10.57it/s]
准确率: 0.9894128970163619
精确率: [0.99091735 0.9877175 ]
召回率: [0.98912058 0.98974359]
F-score: [0.99001815 0.98872951]
----------------------------------------------------------------------------------------------------
27_intention_albert_loss_12.8064: 100%|█████████| 17/17 [00:01<00:00, 10.68it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
28_intention_albert_loss_12.2769: 100%|█████████| 17/17 [00:01<00:00, 10.65it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
29_intention_albert_loss_11.7247: 100%|█████████| 17/17 [00:01<00:00, 10.69it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
30_intention_albert_loss_11.1622: 100%|█████████| 17/17 [00:01<00:00, 10.39it/s]
准确率: 0.9894128970163619
精确率: [0.99181074 0.98672114]
召回率: [0.98821396 0.99076923]
F-score: [0.99000908 0.98874104]
----------------------------------------------------------------------------------------------------
# 2. 基于朴素贝叶斯 {#title-1}
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from datasets import load_from_disk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import pickle
import jieba.posseg as psg
import jieba
jieba.setLogLevel(0)

# Part-of-speech tags that are kept as features, in the original three groups:
# noun-like tags (plus 's'), verb tags and adjective tags.
_ALLOWED_POS = frozenset(
    ['n', 'nr', 'ns', 'nt', 'nl', 'nz', 'nsf', 's']
    + ['v', 'vd', 'vn', 'vx']
    + ['a', 'ad', 'al', 'an']
)

# Stopword sets already read from disk, keyed by file path.
_STOPWORDS_CACHE = {}


def _load_stopwords(path='file/stopwords.txt'):
    """Return the stopwords in ``path`` as a set, reading the file only once."""
    if path not in _STOPWORDS_CACHE:
        # NOTE(review): assumes the stopword file is UTF-8 encoded -- confirm.
        with open(path, encoding='utf-8') as stopword_file:
            _STOPWORDS_CACHE[path] = {line.strip() for line in stopword_file}
    return _STOPWORDS_CACHE[path]


def cut_word(sentence):
    """Segment ``sentence`` and return the kept words joined by single spaces.

    The sentence is split into (word, part-of-speech) pairs with
    ``psg.lcut``.  A word is kept only when its tag is in the allowed tag set
    and the word is not listed in 'file/stopwords.txt'.
    """
    stopwords = _load_stopwords()
    kept_words = [
        word
        for word, pos in psg.lcut(sentence)
        if pos in _ALLOWED_POS and word not in stopwords
    ]
    return ' '.join(kept_words)

def train_vectorizer():
    """Fit a CountVectorizer on the segmented training titles and pickle it.

    Reads the 'train' split of 'data/intention.data', segments each title
    with cut_word(), fits a CountVectorizer capped at 21246 features, prints
    the resulting feature count and writes the fitted vectorizer to
    'finish/intention/bayes/vectorizer.pkl'.
    """
    questions = load_from_disk('data/intention.data')['train']
    questions = [cut_word(question) for question in questions['title']]
    tokenizer = CountVectorizer(max_features=21246)
    tokenizer.fit(questions)
    print('特征数:', len(tokenizer.get_feature_names_out()))
    # The context manager guarantees the pickle is flushed and the handle closed.
    with open('finish/intention/bayes/vectorizer.pkl', 'wb') as model_file:
        pickle.dump(tokenizer, model_file)

def train_bayes_model():
    """Train a multinomial naive Bayes classifier on word-count features.

    Loads the pickled vectorizer from 'finish/intention/bayes/vectorizer.pkl',
    transforms the segmented titles of the 'train' split of
    'data/intention.data', fits MultinomialNB on them and writes the model to
    'finish/intention/bayes/bayes.pkl'.
    """
    with open('finish/intention/bayes/vectorizer.pkl', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    questions = load_from_disk('data/intention.data')['train']
    inputs = [cut_word(title) for title in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)
    estimator = MultinomialNB()
    estimator.fit(inputs, labels)
    # The context manager guarantees the pickle is flushed and the handle closed.
    with open('finish/intention/bayes/bayes.pkl', 'wb') as model_file:
        pickle.dump(estimator, model_file)

def eval_bayes_model():
    """Evaluate the pickled naive Bayes model on the test split and print metrics.

    Loads the vectorizer and the classifier from 'finish/intention/bayes/',
    predicts labels for the segmented titles of the 'test' split of
    'data/intention.data', and prints accuracy plus per-class precision,
    recall and F-score.
    """
    with open('finish/intention/bayes/vectorizer.pkl', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    with open('finish/intention/bayes/bayes.pkl', 'rb') as model_file:
        estimator = pickle.load(model_file)
    questions = load_from_disk('data/intention.data')['test']
    inputs = [cut_word(question) for question in questions['title']]
    labels = questions['label']
    inputs = vectorizer.transform(inputs)
    ypreds = estimator.predict(inputs)

    precision, recall, f_score, _ = precision_recall_fscore_support(labels, ypreds)
    print('准确率:', accuracy_score(labels, ypreds))
    print('精确率:', precision)
    print('召回率:', recall)
    print('F-score:', f_score)

if __name__ == '__main__':
    # Full pipeline: fit the vectorizer, train the classifier, then evaluate it.
    train_vectorizer()
    train_bayes_model()
    eval_bayes_model()

评估的结果为:

准确率: 0.9701636188642926
精确率: [0.99430199 0.94536585]
召回率: [0.94922937 0.99384615]
F-score: [0.97124304 0.969     ]
# 3. 基于支持向量机 {#title-2}
import pickle

from sklearn.svm import SVC
from datasets import load_from_disk
import jieba
import fasttext
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import datasets

# Raise jieba's log level and turn off the `datasets` progress bars.
jieba.setLogLevel(0)
datasets.disable_progress_bar()

def train_svm():
    """Train an SVC on fastText sentence vectors of the training titles.

    Loads the 'train' split of 'data/intention.data' and the fastText model
    'pretrained/cc.zh.300.bin', replaces every title by the sentence vector of
    its jieba-segmented text, fits a default SVC on those vectors and writes
    the model to 'finish/intention/svm/svm.pkl'.
    """
    traindata = load_from_disk('data/intention.data')['train']
    embedder = fasttext.load_model('pretrained/cc.zh.300.bin')

    def collate_function(batch_data):
        """Replace each title in the batch by its sentence vector (as a list)."""
        model_inputs = [
            embedder.get_sentence_vector(' '.join(jieba.lcut(title))).tolist()
            for title in batch_data['title']
        ]
        return {'title': model_inputs, 'label': batch_data['label']}

    # Vectorise the data.
    traindata = traindata.map(collate_function, batched=True, batch_size=32)
    # Train the support vector machine.
    estimator = SVC()
    estimator.fit(traindata['title'], traindata['label'])
    # Store the model; the context manager flushes and closes the file.
    with open('finish/intention/svm/svm.pkl', 'wb') as model_file:
        pickle.dump(estimator, model_file)

def eval_svm():
    """Evaluate the pickled SVC on the train and test splits and print metrics.

    Loads the model from 'finish/intention/svm/svm.pkl' and the fastText model
    'pretrained/cc.zh.300.bin', vectorises every title of 'data/intention.data'
    the same way as in training, and prints accuracy plus per-class precision,
    recall and F-score for the 'train' split and then for the 'test' split,
    separated by a dashed line.
    """
    with open('finish/intention/svm/svm.pkl', 'rb') as model_file:
        estimator = pickle.load(model_file)
    embedder = fasttext.load_model('pretrained/cc.zh.300.bin')

    def collate_function(batch_data):
        """Replace each title in the batch by its sentence vector (as a list)."""
        model_inputs = [
            embedder.get_sentence_vector(' '.join(jieba.lcut(title))).tolist()
            for title in batch_data['title']
        ]
        return {'title': model_inputs, 'label': batch_data['label']}

    dataset = load_from_disk('data/intention.data')
    dataset = dataset.map(collate_function, batched=True, batch_size=32)

    def report(split_name, heading):
        """Print accuracy (after ``heading``) and per-class metrics for one split."""
        split = dataset[split_name]
        y_pred = estimator.predict(split['title'])
        y_true = split['label']
        print(heading, accuracy_score(y_true, y_pred))
        precision, recall, f_score, _ = precision_recall_fscore_support(y_true, y_pred)
        print('精确率:', precision)
        print('召回率:', recall)
        print('F-score:', f_score)

    # Training-set metrics; the heading now names the split, like the test-set one.
    report('train', '训练集:')
    print('-' * 50)

    # Test-set metrics.
    report('test', '测试集:')

if __name__ == '__main__':
    # Train the SVM, then report its metrics on the train and test splits.
    train_svm()
    eval_svm()

评估结果:

训练集: 0.9879503698401336
(array([0.99103551, 0.98462111]), array([0.98582371, 0.99027067]), array([0.98842274, 0.98743781]), array([8747, 8017]))
测试集: 0.9880725190839694
(array([0.98912551, 0.98690176]), array([0.98822997, 0.98789713]), array([0.98867754, 0.98739919]), array([2209, 1983]))

赞(5)
未经允许不得转载:工具盒子 » 基于 svm、朴素贝叶斯、albert 文本分类