【Python NLTK】文本分类,轻松搞定文本归类难题
from nltk.corpus import stopWords from nltk.tokenize import word_tokenize from nltk.stem import PorterStemmer from nltk.classify import NaiveBayesClassifier # 加载数据 data = [("我爱北京", "积极"), ("我讨厌北京", "消极")] # 数据预处理 stop_words = set(stopwords.words("english")) stemmer = PorterStemmer() processed_data = [] for text, label in data: tokens = word_tokenize(text) filtered_tokens = [token for token in tokens if token not in stop_words] stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens] processed_data.append((stemmed_tokens, label)) # 特征提取 all_words = [word for sentence, label in processed_data for word in sentence] word_features = list(set(all_words)) def document_features(document): document_words = set(document) features = {} for word in word_features: features["contains({})".fORMat(word)] = (word in document_words) return features feature_sets = [(document_features(sentence), label) for sentence, label in processed_data] # 模型训练 classifier = NaiveBayesClassifier.train(feature_sets) # 模型评估 print(classifier.accuracy(feature_sets))
登录后复制
-
探索跨空间剪辑技术:突破视觉创作的新可能2024-06-11
-
学习剪辑拍摄技术的重要性2024-06-22
-
如何撰写吸引人的剪辑技术广告语?2024-06-18
-
剪辑高手必备的技术与技巧2024-06-18
-
学会这些技巧,让你的相片剪辑简单又好看2024-06-17