Module `media_analyzer.analyzers.inference`

Expand source code

import torch
import os
from analyzers.sentiment_model.model import TwitterSentimentModel
from transformers import AutoTokenizer
import numpy as np
import re
from transformers import logging
# Silence transformers' informational/warning output; only errors are shown.
logging.set_verbosity_error()

# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = "prajjwal1/bert-mini"
# Path of the fine-tuned weights, resolved relative to this file.
model_dir = os.path.join(
    os.path.dirname(__file__), "sentiment_model/checkpoints/TwitterSentimentModel.pt"
)
print(model_dir)

Sentiment = TwitterSentimentModel(model_name=model_name)
# The model is pinned to the CPU; make_prediction feeds it CPU tensors.
Sentiment.cpu()
# Preferred device (CUDA when available). Kept as a module-level name for
# backward compatibility; the model itself stays on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the weights straight onto the CPU: the model lives there, so staging
# the checkpoint on a GPU first would only waste device memory.
Sentiment.load_state_dict(torch.load(model_dir, map_location="cpu"))
# Switch to evaluation mode so layers such as dropout behave
# deterministically during inference.
Sentiment.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)


def make_prediction(data_sample):
    """
    Classify the sentiment of a single tweet.

    Twitter @-mentions are removed and surrounding whitespace is stripped
    before the text is tokenized (padded/truncated to 64 tokens) and passed
    through the module-level ``Sentiment`` model on the CPU.

        - data_sample: A tweet (str) to make inference on
    returns: A dict with the predicted label under "emo" (NEGATIVE, NEUTRAL
        or POSITIVE) and the cleaned text under "clean-text"
    """
    # Remove every "@" together with the word characters that follow it.
    data_sample = re.sub(r"@\w*", "", data_sample).strip()
    # Maps the index of the highest model output to its label.
    pair = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
    encoded = tokenizer.encode_plus(
        text=data_sample,
        add_special_tokens=True,
        padding="max_length",
        max_length=64,
        return_tensors="pt",
        truncation=True,
        return_attention_mask=True,
    )

    # Inference only: disable autograd so no graph is built for the forward
    # pass (saves memory and time; the outputs are unchanged).
    with torch.no_grad():
        preds = (
            Sentiment(encoded["input_ids"].cpu(), encoded["attention_mask"].cpu())
            .detach()
            .numpy()
            .ravel()
        )
    # Convert the NumPy integer to a plain int before the label lookup.
    p = int(np.argmax(preds))

    return {"emo": pair[p], "clean-text": data_sample}

Functions

def make_prediction(data_sample)

Run the model on a data sample. `data_sample`: a tweet to run inference on. Returns: a dict with the predicted label (NEGATIVE, NEUTRAL, or POSITIVE) under `emo` and the cleaned text under `clean-text`.

Expand source code

def make_prediction(data_sample):
    """
    Classify the sentiment of a single tweet.

    Twitter @-mentions are removed and surrounding whitespace is stripped
    before the text is tokenized (padded/truncated to 64 tokens) and passed
    through the module-level ``Sentiment`` model on the CPU.

        - data_sample: A tweet (str) to make inference on
    returns: A dict with the predicted label under "emo" (NEGATIVE, NEUTRAL
        or POSITIVE) and the cleaned text under "clean-text"
    """
    # Remove every "@" together with the word characters that follow it.
    data_sample = re.sub(r"@\w*", "", data_sample).strip()
    # Maps the index of the highest model output to its label.
    pair = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
    encoded = tokenizer.encode_plus(
        text=data_sample,
        add_special_tokens=True,
        padding="max_length",
        max_length=64,
        return_tensors="pt",
        truncation=True,
        return_attention_mask=True,
    )

    # Inference only: disable autograd so no graph is built for the forward
    # pass (saves memory and time; the outputs are unchanged).
    with torch.no_grad():
        preds = (
            Sentiment(encoded["input_ids"].cpu(), encoded["attention_mask"].cpu())
            .detach()
            .numpy()
            .ravel()
        )
    # Convert the NumPy integer to a plain int before the label lookup.
    p = int(np.argmax(preds))

    return {"emo": pair[p], "clean-text": data_sample}