from pathlib import Path

import nltk.corpus
import onnxruntime as ort
import pandas as pd
import skl2onnx
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

/tmp/ipykernel_2019/3651809897.py:5: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

# Fetch the NLTK movie-review corpus before reading from it.
nltk.download("movie_reviews")
dataset_classes = nltk.corpus.movie_reviews.categories()
# One row per corpus file: the raw review text plus its sentiment label.
dataset = pd.DataFrame(
    [
        dict(
            text=nltk.corpus.movie_reviews.raw(review_id),
            # The label is the leading path component of the file id.
            sentiment=review_id.partition("/")[0],
        )
        for review_id in nltk.corpus.movie_reviews.fileids()
    ]
)
dataset

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/runner/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.

# Separate the raw review text (features) from the sentiment label (target).
X = dataset["text"]
y = dataset["sentiment"]
# Hold out 25% of the reviews for evaluation.  A fixed `random_state` makes the
# split -- and therefore every metric computed from it -- reproducible between
# runs, and `stratify=y` keeps the label proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y
)

# Two-step model: TF-IDF features computed from the raw text feed a
# logistic-regression classifier.  The step names are part of the pipeline's
# parameter namespace, so they are kept exactly as-is.
pipeline = Pipeline(
    steps=[
        ("tf-idf", TfidfVectorizer()),
        ("classifier", LogisticRegression()),
    ],
)
# Kept as the last expression so the notebook still renders the fitted pipeline.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('tf-idf', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

Pipeline(steps=[('tf-idf', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

TfidfVectorizer()

LogisticRegression()

# Score the fitted pipeline on the held-out test split.
pipeline.score(X_test, y_test)

0.848

# Spot-check the classifier on a short, positive-sounding phrase.
pipeline.predict(["a nice and good take"])

array(['pos'], dtype=object)

# Spot-check the classifier on a short, negative-sounding phrase.
pipeline.predict(["it hurts so bad"])

array(['neg'], dtype=object)

# Converter options are keyed by the id() of the model they apply to.  With
# these settings the exported graph yields the predicted label, a plain
# probability array and the list of class labels (see the session output
# captured further down in this notebook).
onnx_options = {id(pipeline): {"zipmap": False, "output_class_labels": True}}
# skl2onnx only needs a sample to infer the input type, so a single training
# row is passed.  It is selected positionally with `.iloc` and converted with
# `.to_numpy()` so the converter always receives a genuine NumPy array.
onnx_model = skl2onnx.to_onnx(
    pipeline,
    X_train.iloc[:1].to_numpy(),
    options=onnx_options,
)
# Serialize the model next to the notebook; `write_bytes` returns the number
# of bytes written, which the notebook shows as the cell output.
onnx_model_path = Path("model.onnx")
onnx_model_path.write_bytes(onnx_model.SerializeToString())

1130676

# Load the exported model, offering onnxruntime every execution provider that
# this installation reports as available.
session = ort.InferenceSession(
    path_or_bytes=onnx_model_path,
    providers=ort.get_available_providers(),
)
# Feed one raw string to the graph input named "X"; no output names are
# selected, so the call is made with `None` in that position.
session.run(output_names=None, input_feed={"X": ["it hurts so bad"]})

[array(['neg'], dtype=object),
 array([[0.76245755, 0.23754245]], dtype=float32),
 array(['neg', 'pos'], dtype=object)]

	text	sentiment
0	plot : two teen couples go to a church party ,...	neg
1	the happy bastard's quick movie review \ndamn ...	neg
2	it is movies like these that make a jaded movi...	neg
3	" quest for camelot " is warner bros . ' firs...	neg
4	synopsis : a mentally unstable man undergoing ...	neg
...	...	...
1995	wow ! what a movie . \nit's everything a movie...	pos
1996	richard gere can be a commanding actor , but h...	pos
1997	glory--starring matthew broderick , denzel was...	pos
1998	steven spielberg's second epic film on world w...	pos
1999	truman ( " true-man " ) burbank is the perfect...	pos

Sentiment analysis model training