Module media_analyzer.analyzers.sentiment_model.data

Data-loading utilities for the sentiment model: tweet cleaning and one-hot label encoding, a train/validation split, a tokenizing map-style dataset, and a DataLoader factory.
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import re


class TwitterSentimentDataset(Dataset):
    def __init__(self, text, polarity=None, max_len=64, model_name="bert-base-uncased"):
        self.text = text
        self.polarity = polarity
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Tokenize a single tweet, padding and truncating to a fixed length.
        encoding = self.tokenizer.encode_plus(
            text=self.text[index],
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
        )
        # With return_tensors="pt", encode_plus returns tensors of shape
        # (1, max_len); drop the extra batch dimension so the DataLoader
        # collates samples into (batch_size, max_len) batches.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        if self.polarity is not None:
            # Labels are one-hot lists (see data_preprocess), returned as floats.
            return input_ids, attention_mask, torch.FloatTensor(self.polarity[index])
        return input_ids, attention_mask


def data_preprocess(df, text_col, label_col, num_labels, label_encodings=None):
    # Map raw label values (e.g. "positive") to integer indices when a
    # mapping is supplied.
    if label_encodings is not None:
        df[label_col] = df[label_col].apply(lambda x: label_encodings[x])
    # Strip @-mentions and surrounding whitespace from the tweet text.
    df[text_col] = df[text_col].apply(lambda x: re.sub(r"@\w*", "", str(x)).strip())

    def one_hot(label):
        # Expand an integer class index into a one-hot float vector.
        vec = [0.0] * num_labels
        vec[label] = 1.0
        return vec

    df[label_col] = df[label_col].apply(one_hot)
    return df


def split_data(df):
    # 80/20 train/validation split; the fixed seed makes it reproducible.
    train, val = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
    return train, val


def prep_data(df, model_name):
    # Build a tokenizing dataset over the "text" and "sentiment" columns,
    # then wrap it in a shuffling, batching DataLoader.
    dataset = TwitterSentimentDataset(
        df["text"].tolist(), df["sentiment"].tolist(), model_name=model_name
    )
    loader = DataLoader(
        dataset,
        batch_size=64,
        num_workers=2,
        shuffle=True,
        pin_memory=False,
        drop_last=False,
    )
    return loader

Functions

def data_preprocess(df, text_col, label_col, num_labels, label_encodings=None)
Cleans and encodes a DataFrame in place: raw labels are first mapped through label_encodings when one is given, @-mentions are stripped from the text column, and each integer label is expanded into a one-hot float vector of length num_labels. Returns the mutated df. See the module source above.
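
A minimal usage sketch; the three-class label_encodings mapping and the example rows are illustrative, not fixed by the module:

import pandas as pd

df = pd.DataFrame(
    {
        "text": ["@user great movie!", "terrible service @support"],
        "sentiment": ["positive", "negative"],
    }
)
encodings = {"negative": 0, "neutral": 1, "positive": 2}  # assumed mapping
df = data_preprocess(df, "text", "sentiment", num_labels=3, label_encodings=encodings)
# df["text"]      -> ["great movie!", "terrible service"]
# df["sentiment"] -> [[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]
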
def prep_data(df, model_name)
Builds a TwitterSentimentDataset from the frame's "text" and "sentiment" columns and wraps it in a DataLoader with batch_size=64, num_workers=2, and shuffling enabled. The frame is expected to have been through data_preprocess first.
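
A hedged sketch of consuming the returned loader in a training loop; train_df is assumed to be a preprocessed DataFrame:

loader = prep_data(train_df, model_name="bert-base-uncased")
for input_ids, attention_mask, targets in loader:
    # input_ids and attention_mask collate to shape (batch_size, 64);
    # targets collates to (batch_size, num_labels) with one-hot float rows.
    ...
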
def split_data(df)
Shuffles and splits the frame 80/20 into train and validation sets; the fixed random_state=42 makes the split reproducible across runs.
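
Putting the three helpers together, a hedged end-to-end sketch; the tweets.csv file name and the encodings mapping are assumptions for illustration:

import pandas as pd

df = pd.read_csv("tweets.csv")  # hypothetical file with text/sentiment columns
encodings = {"negative": 0, "neutral": 1, "positive": 2}  # assumed label mapping
df = data_preprocess(df, "text", "sentiment", num_labels=3, label_encodings=encodings)
train_df, val_df = split_data(df)
train_loader = prep_data(train_df, model_name="bert-base-uncased")
val_loader = prep_data(val_df, model_name="bert-base-uncased")

Note that prep_data always shuffles; for a validation loader one would typically disable shuffling, but the sketch keeps the module's behavior as written.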

Classes

class TwitterSentimentDataset (text, polarity=None, max_len=64, model_name='bert-base-uncased')

A map-style torch.utils.data.Dataset over tweet text. Each item is tokenized on demand with a Hugging Face AutoTokenizer, padded and truncated to max_len, and returned as (input_ids, attention_mask) tensors; when polarity labels are provided, a FloatTensor of the label vector is returned as a third element.


Ancestors

  • torch.utils.data.dataset.Dataset
  • typing.Generic
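
A hedged construction sketch using the dataset directly, without prep_data; the texts and one-hot labels are illustrative:

texts = ["great movie!", "terrible service"]
labels = [[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]  # one-hot, as data_preprocess produces
ds = TwitterSentimentDataset(texts, polarity=labels, max_len=64)
input_ids, attention_mask, target = ds[0]
# input_ids and attention_mask: shape (64,) LongTensors
# target: shape (3,) FloatTensor
print(len(ds))  # 2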