Recurrent Neural Networks
Date: 2023-02-27 23:36  Author: wen  Category: AI
Data file: online_shopping_10_cats.csv
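The file is assumed to hold one review per line in three comma-separated columns, cat,label,review, with a header line, where label is 1 for a positive review and 0 for a negative one. A hypothetical sample row:

书籍,1,很好的一本书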
# shopping_data.py
import os
import random
import re

import jieba
import numpy as np
from tensorflow.keras.preprocessing import text


def load_data():
    xs = []
    ys = []
    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'online_shopping_10_cats.csv')
    with open(data_path, 'r', encoding='utf-8') as f:
        line = f.readline()  # skip the header line: "cat,label,review"
        while line:
            line = f.readline()
            if not line:
                break
            # split into at most 3 fields so commas inside the review are preserved
            contents = line.split(',', 2)
            # if contents[0] == "书籍":
            #     continue
            label = int(contents[1])
            review = contents[2].strip()
            # keep only short reviews (at most 20 characters)
            if len(review) > 20:
                continue
            xs.append(review)
            ys.append(label)
    xs = np.array(xs)
    ys = np.array(ys)
    # shuffle the dataset
    indices = list(range(len(xs)))
    random.seed(666)
    random.shuffle(indices)
    xs = xs[indices]
    ys = ys[indices]
    # 80/20 train/test split
    m = len(xs)
    cutpoint = int(m * 4 / 5)
    x_train = xs[:cutpoint]
    y_train = ys[:cutpoint]
    x_test = xs[cutpoint:]
    y_test = ys[cutpoint:]
    print('Total samples: %d' % len(xs))
    print('Training samples: %d' % len(x_train))
    print('Test samples: %d' % len(x_test))
    return x_train, y_train, x_test, y_test
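
# Note on the shuffle above: reordering both arrays with the same index list keeps
# each review paired with its label. A tiny sketch of NumPy fancy indexing:
#   np.array(['a', 'b', 'c'])[[2, 0, 1]]  ->  array(['c', 'a', 'b'])
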
def createWordIndex(x_train, x_test):
    x_all = np.concatenate((x_train, x_test), axis=0)
    # build a word index over the full corpus
    tokenizer = text.Tokenizer()
    # count word frequencies
    word_dic = {}
    for sentence in x_all:
        # strip punctuation
        sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", sentence)
        # segment with jieba
        for word in jieba.cut(sentence):
            word_dic[word] = word_dic.get(word, 0) + 1
    # sort words by frequency, most frequent first
    word_dic = sorted(word_dic.items(), key=lambda kv: kv[1], reverse=True)
    voca = [v[0] for v in word_dic]
    tokenizer.fit_on_texts(voca)
    print("voca: " + str(len(voca)))
    return len(voca), tokenizer.word_index
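
# Note: each entry in voca appears exactly once, so fit_on_texts sees every word
# with the same count and (by stable sort) keeps insertion order; word_index therefore
# follows voca's corpus-frequency order. A minimal sketch with hypothetical input:
#   t = text.Tokenizer()
#   t.fit_on_texts(['很 好', '很'])
#   t.word_index  ->  {'很': 1, '好': 2}
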
def word2Index(words, word_index):
    vecs = []
    for sentence in words:
        # strip punctuation
        sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", sentence)
        # segment with jieba and map each word to its integer index
        index = []
        for word in jieba.cut(sentence):
            if word in word_index:
                index.append(word_index[word])
        # if len(index) > 25:
        #     index = index[0:25]
        vecs.append(index)
    # sequences have different lengths, so use an object array
    return np.array(vecs, dtype=object)
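
Taken together, createWordIndex builds the vocabulary and word2Index turns each raw review into a list of integer word indices. A minimal round-trip sketch on a toy corpus (file name and reviews are hypothetical):

# demo_word2index.py
import numpy as np
import shopping_data

x_train = np.array(['这本书很好'])
x_test = np.array(['质量太差了'])
vocalen, word_index = shopping_data.createWordIndex(x_train, x_test)
print(word_index)                                     # word -> index mapping
print(shopping_data.word2Index(x_train, word_index))  # reviews as index lists
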
# comments_recognizer.py
import shopping_data
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
X_train, Y_train, X_test, Y_test = shopping_data.load_data()
print("X_train.shape:", X_train.shape)
print("Y_train.shape:", Y_train.shape)
print("X_test.shape:", X_test.shape)
print("Y_test.shape:", Y_test.shape)
print(X_train[0])
print(Y_train[0])
# vocalen: number of words in the vocabulary
# word_index: word-to-index dictionary built from the full train + test corpus
vocalen, word_index = shopping_data.createWordIndex(X_train, X_test)
print(word_index)
print('Vocabulary size:', vocalen)
# convert reviews to their index representation
X_train_index = shopping_data.word2Index(X_train, word_index)
X_test_index = shopping_data.word2Index(X_test, word_index)
maxlen = 25
# align every sequence to length maxlen
X_train_index = sequence.pad_sequences(X_train_index, maxlen=maxlen)
X_test_index = sequence.pad_sequences(X_test_index, maxlen=maxlen)
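
# Note: by default pad_sequences left-pads with 0 and truncates from the front
# (padding='pre', truncating='pre'), e.g.:
#   sequence.pad_sequences([[3, 7]], maxlen=5)  ->  [[0, 0, 0, 3, 7]]
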
# the network
model = Sequential()
# trainable: whether the embedding weights are updated during training
# input_dim: vocabulary size + 1, because Tokenizer indices start at 1 and 0 is the padding value
# output_dim: embedding vector dimension
# input_length: sequence (sentence) length
model.add(Embedding(trainable=True, input_dim=vocalen + 1, output_dim=300, input_length=maxlen))
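# The embedding layer is a (vocalen + 1) x 300 weight matrix: one 300-dimensional
# vector per word index, with row 0 reserved for padding, so it alone contributes
# (vocalen + 1) * 300 trainable parameters.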
# flatten the 25 x 300 embedding output into a single 7500-dimensional vector
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_index, Y_train, batch_size=512, epochs=20)
score, acc = model.evaluate(X_test_index, Y_test)
print("Test score:", score)
print("Test accuracy:", acc)
Tags: AI