python - Use Logistic Regression for Tweets Topic Classification -
i have problem regarding use of logistic regression. i'm making tweets topic classification in python. far i'm able read train data mysql table using pandas, clean train tweets using nltk , create feature vectors using countvectorizer. here's code below..
import pandas pd sqlalchemy import * nltk.tokenize import regexptokenizer nltk.corpus import stopwords import re nltk.stem import snowballstemmer sklearn.feature_extraction.text import countvectorizer sklearn.linear_model import logisticregression #connect database , training data engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/machinelearning') tweet = pd.read_sql_query('select label, tweets tweetstable', engine, index_col='label') #text preprocessing (remove html markup, remove punctuation, tokenizing, remove stop words, stemming) def preprocessing(pptweets): pptweets = pptweets.lower() urlrtweets = re.sub(r'https:.*$', ":", pptweets) rpptweets = urlrtweets.replace("_", " ") tokenizer = regexptokenizer(r'\w+') tokens = tokenizer.tokenize(rpptweets) filteredwords = [w w in tokens if not w in stopwords.words('english')] stemmer = snowballstemmer("english") stweets = [stemmer.stem(tokens) tokens in filteredwords] return " ".join(stweets) #initialize empty list hold clean reviews cleantweets = [] #loop on each review, create index goes 0 length of tweets list in range(0, len(tweet["tweets"])): cleantweets.append(preprocessing(tweet["tweets"][i])) #initialize "countvectorizer" object, scikit-learn's bow tools vectorizer = countvectorizer(analyzer="word", tokenizer=none, preprocessor=none, stop_words=none, max_features=5000) #fit_transform() 2 functions: first, fits model #and learns vocabulary; second, transforms our training data #into feature vectors. input fit_transform should list of strings traindatafeatures = vectorizer.fit_transform(cleantweets) #numpy arrays easy work with, convert result array traindatafeatures = traindatafeatures.toarray()
the problem i'm facing right is.. don't know how use logistic regression learn train data. here's code use fit train data logistic regression classifier.
#train model logmodel = logisticregression() logmodel.fit(traindatafeatures, tweet["label"]) #check trained model intercept print(logmodel.intercept_) #check trained model coefficients print(logmodel.coef_)
i pass traindatafeatures input x , tweet["label"] label/class y each tweet logistic regression classifier can learn when run full code error this:
traceback (most recent call last): file "c:\users\indra\anaconda3\lib\site-packages\pandas\indexes\base.py", line 1945, in get_loc return self._engine.get_loc(key) file "pandas\index.pyx", line 137, in pandas.index.indexengine.get_loc (pandas\index.c:4154) file "pandas\index.pyx", line 159, in pandas.index.indexengine.get_loc (pandas\index.c:4018) file "pandas\hashtable.pyx", line 675, in pandas.hashtable.pyobjecthashtable.get_item (pandas\hashtable.c:12368) file "pandas\hashtable.pyx", line 683, in pandas.hashtable.pyobjecthashtable.get_item (pandas\hashtable.c:12322) keyerror: 'label'
during handling of above exception, exception occurred:
traceback (most recent call last): file "c:/users/indra/pycharmprojects/textclassifier/textclassifier.py", line 52, in <module> logmodel.fit(traindatafeatures, tweet["label"]) file "c:\users\indra\anaconda3\lib\site-packages\pandas\core\frame.py", line 1997, in __getitem__ return self._getitem_column(key) file "c:\users\indra\anaconda3\lib\site-packages\pandas\core\frame.py", line 2004, in _getitem_column return self._get_item_cache(key) file "c:\users\indra\anaconda3\lib\site-packages\pandas\core\generic.py", line 1350, in _get_item_cache values = self._data.get(item) file "c:\users\indra\anaconda3\lib\site-packages\pandas\core\internals.py", line 3290, in loc = self.items.get_loc(item) file "c:\users\indra\anaconda3\lib\site-packages\pandas\indexes\base.py", line 1947, in get_loc return self._engine.get_loc(self._maybe_cast_indexer(key)) file "pandas\index.pyx", line 137, in pandas.index.indexengine.get_loc (pandas\index.c:4154) file "pandas\index.pyx", line 159, in pandas.index.indexengine.get_loc (pandas\index.c:4018) file "pandas\hashtable.pyx", line 675, in pandas.hashtable.pyobjecthashtable.get_item (pandas\hashtable.c:12368) file "pandas\hashtable.pyx", line 683, in pandas.hashtable.pyobjecthashtable.get_item (pandas\hashtable.c:12322) keyerror: 'label'
can me solve problem? :( i've been searching tutorials haven't found far.
Comments
Post a Comment