LSTM Networks: Natural Language Processing
Date: 2023-2-27 23:39 Author: wen Category: AI
This post trains a two-layer LSTM sentiment classifier on Chinese online-shopping reviews, using jieba for word segmentation and pretrained 300-dimensional Chinese word vectors as a frozen embedding layer.
File downloads: online_shopping_10_cats.csv
关于第三方预训练词向量的下载方法.docx (how to download the third-party pretrained word vectors)
# shopping_data.py
import os
import re
import random
import numpy as np
import jieba
from tensorflow.keras.preprocessing import text

def load_data():
    xs = []
    ys = []
    with open(os.path.dirname(os.path.abspath(__file__)) + '/online_shopping_10_cats.csv', 'r', encoding='utf-8') as f:
        f.readline()  # skip the header line: "cat,label,review"
        for line in f:
            # split at most twice so commas inside the review text survive
            contents = line.split(',', 2)
            # if contents[0] == "书籍":  # optionally skip the "Books" category
            #     continue
            label = int(contents[1])
            review = contents[2].strip()
            # keep only short reviews (at most 20 characters) to speed up training
            if len(review) > 20:
                continue
            xs.append(review)
            ys.append(label)
    xs = np.array(xs)
    ys = np.array(ys)
    # shuffle the dataset with a fixed seed so the split is reproducible
    indices = list(range(len(xs)))
    random.seed(666)
    random.shuffle(indices)
    xs = xs[indices]
    ys = ys[indices]
    # 80/20 train/test split
    m = len(xs)
    cutpoint = int(m * 4 / 5)
    x_train = xs[:cutpoint]
    y_train = ys[:cutpoint]
    x_test = xs[cutpoint:]
    y_test = ys[cutpoint:]
    print('Total samples: %d' % len(xs))
    print('Training samples: %d' % len(x_train))
    print('Test samples: %d' % len(x_test))
    return x_train, y_train, x_test, y_test
def createWordIndex(x_train, x_test):
    x_all = np.concatenate((x_train, x_test), axis=0)
    # build a word index over the whole corpus
    tokenizer = text.Tokenizer()
    # count word frequencies
    word_dic = {}
    for sentence in x_all:
        # strip punctuation (ASCII and full-width Chinese)
        sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", sentence)
        # segment with jieba
        cut = jieba.cut(sentence)
        for word in cut:
            if word not in word_dic:
                word_dic[word] = 1
            else:
                word_dic[word] += 1
    # sort by frequency, most frequent first
    word_dic = sorted(word_dic.items(), key=lambda kv: kv[1], reverse=True)
    voca = [v[0] for v in word_dic]
    tokenizer.fit_on_texts(voca)
    print("voca: " + str(len(voca)))
    return len(voca), tokenizer.word_index
def word2Index(words, word_index):
    vecs = []
    for sentence in words:
        # strip punctuation (ASCII and full-width Chinese)
        sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", sentence)
        # segment with jieba
        cut = jieba.cut(sentence)
        index = []
        for word in cut:
            # words missing from the index are simply dropped
            if word in word_index:
                index.append(word_index[word])
        vecs.append(index)
    # return ragged sequences as a plain list; pad_sequences aligns the lengths later
    return vecs
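
To sanity-check the two helpers above, here is a minimal sketch (not part of the original post; the tiny "corpora" are made up for illustration) that builds a word index and converts a sentence into its index sequence:

# toy sanity check for createWordIndex / word2Index (hypothetical)
import numpy as np
import shopping_data

x_train = np.array(['这个手机很好用', '质量太差了'])  # made-up training sentences
x_test = np.array(['物流很快,非常满意'])             # made-up test sentence

vocalen, word_index = shopping_data.createWordIndex(x_train, x_test)
print(vocalen)      # number of distinct words after segmentation
print(word_index)   # e.g. {'很': 1, '这个': 2, ...}; exact indices depend on frequency

vecs = shopping_data.word2Index(x_test, word_index)
print(vecs[0])      # the test sentence as a list of word indices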
# chinese_vec.py
import os
import numpy as np

def load_word_vecs():
    embeddings_index = {}
    # pretrained 300-dimensional Chinese word vectors in plain-text word2vec format
    with open(os.path.dirname(os.path.abspath(__file__)) + '/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5',
              encoding='utf8') as f:
        f.readline()  # skip the header line (vocabulary size and vector dimension)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index
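
Each line of the vector file is a word followed by 300 floats. A minimal usage sketch (not in the original post; it assumes the vector file has been downloaded next to chinese_vec.py, and '手机' is just an example word that happens to be in the vocabulary):

import chinese_vec

word_vecs = chinese_vec.load_word_vecs()  # dict: word -> np.ndarray of shape (300,)
vec = word_vecs.get('手机')               # returns None if the word is missing
if vec is not None:
    print(vec.shape)   # (300,)
    print(vec[:5])     # first five components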
# comments_lstm.py
import numpy as np
import shopping_data
import chinese_vec
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

X_train, Y_train, X_test, Y_test = shopping_data.load_data()
print("X_train.shape:", X_train.shape)
print("Y_train.shape:", Y_train.shape)
print("X_test.shape:", X_test.shape)
print("Y_test.shape:", Y_test.shape)
print(X_train[0])
print(X_test[0])

# build the word index over train + test, then map every review to an index sequence
vocalen, word_index = shopping_data.createWordIndex(X_train, X_test)
print(word_index)
print('Vocabulary size:', vocalen)
X_train_index = shopping_data.word2Index(X_train, word_index)
X_test_index = shopping_data.word2Index(X_test, word_index)
# pad/truncate every review to a fixed length of 25 word indices
maxlen = 25
X_train_index = sequence.pad_sequences(X_train_index, maxlen=maxlen)
X_test_index = sequence.pad_sequences(X_test_index, maxlen=maxlen)
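# Added check (not in the original script): pad_sequences left-pads short
# reviews with index 0 and truncates long ones from the front, so every
# sample is now a fixed-length row of 25 integers.
print(X_train_index.shape)  # (num_train_samples, 25)
print(X_train_index[0])     # leading zeros are padding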
# build the embedding matrix from the pretrained vectors;
# word indices start at 1, so the matrix needs vocalen + 1 rows,
# with row 0 left all-zero for the padding index
word_vecs = chinese_vec.load_word_vecs()
embedding_matrix = np.zeros((vocalen + 1, 300))
for word, i in word_index.items():
    embedding_vector = word_vecs.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
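# Added check (not in the original script): report how many vocabulary words
# are covered by the pretrained vectors; uncovered words keep all-zero rows.
covered = sum(1 for w in word_index if w in word_vecs)
print('pretrained coverage: %d / %d' % (covered, len(word_index)))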
model = Sequential()
# trainable=False freezes the pretrained embeddings
# input_dim: vocabulary size + 1 (index 0 is the padding token)
# output_dim: word-vector dimension (300)
# input_length: sequence length after padding
model.add(Embedding(trainable=False, input_dim=vocalen + 1, weights=[embedding_matrix],
                    output_dim=300, input_length=maxlen))
model.add(LSTM(128, return_sequences=True))  # first LSTM layer emits the full sequence
model.add(LSTM(128))                         # second LSTM layer keeps only the last state
model.add(Dense(1, activation='sigmoid'))    # binary sentiment: positive vs. negative
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_index, Y_train, batch_size=512, epochs=20)
score, acc = model.evaluate(X_test_index, Y_test)
print("Test score:", score)
print("Test accuracy:", acc)
Tags: Artificial Intelligence