# TextClassifier.py
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import logging
# Demonstration logging setup: show INFO-and-above records, each rendered
# as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
class TextClassifier:
    """
    A simple text classification pipeline.

    Raw strings are normalized by ``preprocess_text``, converted to token
    counts by a ``CountVectorizer`` and classified by a
    ``LogisticRegression`` model created with ``max_iter=1000``.
    """

    # Matches every character that is neither a lowercase ASCII letter nor
    # whitespace. Compiled once at class creation instead of per call.
    _NON_LETTER_PATTERN = re.compile(r"[^a-z\s]")

    def __init__(self):
        """Create the (still unfitted) vectorizer and model."""
        self.vectorizer = CountVectorizer()
        self.model = LogisticRegression(max_iter=1000)
        logging.info("TextClassifier initialized.")

    def preprocess_text(self, text: str) -> str:
        """
        Cleans and normalizes a single text string.

        - Lowercases text
        - Removes every character that is not an ASCII letter (a-z) or
          whitespace; digits and punctuation are dropped, not replaced
        - Collapses whitespace runs to single spaces and trims both ends

        Args:
            text: The raw input string.

        Returns:
            The normalized string (possibly empty).
        """
        # Lazy %-style arguments: the message is only rendered when DEBUG
        # logging is actually enabled, which matters because this method
        # runs once per document.
        logging.debug("Preprocessing text: '%s'", text)
        text = text.lower()
        text = self._NON_LETTER_PATTERN.sub("", text)
        # split() with no argument splits on any whitespace run, so the
        # join leaves exactly one space between the remaining words.
        text = " ".join(text.split())
        logging.debug("Preprocessed text: '%s'", text)
        return text

    def train(self, texts: list[str], labels: list[str]) -> None:
        """
        Trains the classification model.

        Args:
            texts: Raw training documents.
            labels: One label per document, in the same order as ``texts``.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        logging.info("Starting model training.")
        # Preprocessing is pure, so it is safe to run before validation.
        processed_texts = [self.preprocess_text(text) for text in texts]
        # Validate before touching the vectorizer: refitting it and then
        # failing inside model.fit would leave a new vocabulary paired
        # with a model trained on the previous one (or not trained at all).
        if len(processed_texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length "
                f"(got {len(processed_texts)} texts and {len(labels)} labels)"
            )
        # Fit vectorizer and transform texts
        X = self.vectorizer.fit_transform(processed_texts)
        y = labels
        # Train the model
        self.model.fit(X, y)
        logging.info("Model training completed.")

    def predict(self, texts: list[str]) -> list[str]:
        """
        Makes predictions on new text data.

        The texts go through the same preprocessing as in ``train`` and are
        transformed with the already-fitted vectorizer, so ``train`` must
        have been called first.

        Args:
            texts: Raw documents to classify.

        Returns:
            The predicted label for each document, in input order.
        """
        logging.info("Starting prediction.")
        # Preprocess new texts
        processed_texts = [self.preprocess_text(text) for text in texts]
        # Transform texts using the fitted vectorizer
        X_new = self.vectorizer.transform(processed_texts)
        # Make predictions; tolist() converts the model's array output
        # into a plain Python list.
        predictions = self.model.predict(X_new).tolist()
        logging.info("Prediction completed.")
        return predictions

    def evaluate(self, texts: list[str], true_labels: list[str]) -> float:
        """
        Evaluates the model's accuracy.

        Args:
            texts: Raw documents to classify.
            true_labels: The expected label for each document.

        Returns:
            The value of ``accuracy_score(true_labels, predictions)``.
        """
        logging.info("Starting model evaluation.")
        predictions = self.predict(texts)
        score = accuracy_score(true_labels, predictions)
        logging.info("Model accuracy: %.4f", score)
        return score