In [136]:
%pip install vaderSentiment
%pip install nltk
%pip install fastapi
%pip install pydantic
%pip install uvicorn
%pip install python-multipart
In [70]:
import pandas as pd
In [3]:
pwd
Out[3]:
'C:\\Users\\rsb84'
In [69]:
import os
os.chdir(r'C:\Users\rsb84\Downloads\FastAPI\portfolio_project')
I will first demonstrate - mostly without using functions - how I intend to tokenize sentences within a body of text and then score the text with a sentiment analysis model. Then I will organize this code more formally into functions for use with FastAPI, resulting in an API with which users can score their own text files (e.g., files containing newspaper articles).¶
I will start by loading a newspaper article, then tokenizing it with the NLTK Punkt sentence tokenizer.¶
In [71]:
with open("Wall Street Rattled Over Worries About A.I. and Inflation.txt", "r", encoding="utf-8") as f:
text_blob = f.read()
df = pd.DataFrame({"text": [text_blob]})
In [72]:
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\rsb84\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
Out[72]:
True
In [77]:
from nltk.tokenize.punkt import PunktSentenceTokenizer
In [86]:
# Instantiate an (untrained) Punkt sentence tokenizer
tokenizer = PunktSentenceTokenizer()
I will add the following abbreviations to improve the Punkt tokenizer's accuracy and help ensure it does not split the text mid-sentence.¶
In [92]:
extra_abbrevs = {
# titles / honorifics
'mr', 'prof', 'sr', 'jr', 'rev', 'hon', 'gov',
'pres', 'gen', 'col', 'lt', 'cmdr', 'capt', 'sgt', 'dr', 'sen', 'rep', 'ms', 'mrs',
# academic / organizational
'ph.d', 'm.d', 'b.a', 'm.a', 'inc', 'ltd', 'co', 'corp',
# date / time
'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec',
# address / locations
'apt', 'dept', 'no', 'mt', 'ft', 'hwy', 'ave', 'blvd', 'rd', 'st',
# places
'u.s', 'u.s.a', 'd.c', 'u.k', 'e.u', 'u.a.e',
# Latin / common shorthand
'vs', 'al', 'fig', 'eq', 'est', 'cf', 'approx', 'e.g', 'i.e', 'etc', 'p.s',
# days
'mon', 'tue', 'tues', 'wed', 'thu', 'thur', 'thurs', 'fri', 'sat', 'sun',
# extra words
'a.i'
}
tokenizer._params.abbrev_types.update(extra_abbrevs)
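As a quick sanity check of the effect, here is a hypothetical example (the sentence and the abbreviation set are illustrative, not taken from the article): with 'gov' registered as an abbreviation, the period after "Gov." is not treated as a sentence boundary.

```python
from nltk.tokenize.punkt import PunktSentenceTokenizer

demo_tokenizer = PunktSentenceTokenizer()
demo_tokenizer._params.abbrev_types.update({'gov'})

# Because 'gov' is a registered abbreviation, the period after "Gov."
# does not end a sentence, leaving exactly two sentences.
sample = "Lawmakers met Gov. Smith on Tuesday. The talks went well."
sents = demo_tokenizer.sentences_from_text(sample)
print(sents)
```

Without the abbreviation, the tokenizer risks splitting after "Gov." and cutting the first sentence in half.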
In [25]:
df.head()
Out[25]:
| text | |
|---|---|
| 0 | Wall Street Rattled Over Worries About A.I. an... |
In [118]:
sentences = []
for corpus in df.text:
clean_corpus = corpus.replace("\n", " ").replace("\r", " ")
sentences.append(tokenizer.sentences_from_text(clean_corpus))
In [119]:
sentences = pd.DataFrame(sentences, index=None)
For brevity, I will limit the number of words displayed per sentence below to just three:¶
In [121]:
sentences.head().map(lambda x: ' '.join(str(x).split()[:3]))
Out[121]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wall Street Rattled | A surprisingly discouraging | The S&P 500 | The Dow Jones | The losses came | Block, the company | That's even though | ''Intelligence tools have | ''We're already seeing | A significantly smaller | ... | Rival chip companies | Worries are hurting | Can big spenders | On the winning | Discovery's studio and | That put Skydance-owned | Paramount Skydance shares | Discovery fell 2.2 | Some of the | It's the latest |
1 rows × 33 columns
For the sentiment analyzer, I will use the VADER (Valence Aware Dictionary and sEntiment Reasoner) compound score, which is normalized between -1 and 1: the more negative the value, the more negative the sentiment, and the more positive the value, the more positive the sentiment. VADER is known to score short texts like social media posts well, but it can be problematic when scoring long texts.¶
Since I will be scoring a newspaper article below, I have therefore decided to split the text into sentences, score each sentence separately, and then take the mean of the sentence scores as the text's overall score.¶
In [122]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Instantiate the analyzer once, rather than on every call
analyzer = SentimentIntensityAnalyzer()

def sentiment_fun(var):
    # Return VADER's compound score, normalized to [-1, 1]
    return analyzer.polarity_scores(var)['compound']
In [123]:
vader_sentences = [[sentiment_fun(sentence) for sentence in corpus if sentence is not None] for corpus in sentences.values]
In [124]:
vader_sentences_df = pd.DataFrame(vader_sentences)
In [125]:
vader_sentences_df.shape
Out[125]:
(1, 33)
In [126]:
vader_sentences_df.head()
Out[126]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.8689 | -0.7783 | -0.3818 | 0.0 | -0.5994 | -0.5267 | 0.5106 | 0.7184 | 0.0 | 0.4404 | ... | 0.0 | 0.5574 | 0.7269 | 0.5267 | 0.0 | 0.0 | 0.296 | 0.0 | -0.2023 | -0.296 |
1 rows × 33 columns
In [127]:
import numpy as np
axis=1 means: take the mean across the columns for each row.¶
In [130]:
sentences_vader_means = vader_sentences_df.mean(axis=1, skipna=True)
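A tiny, hypothetical frame makes the axis=1 behavior concrete:

```python
import pandas as pd

demo_df = pd.DataFrame([[-0.5, 0.0, 0.8]])
# axis=1: average the three column values within the single row
row_means = demo_df.mean(axis=1, skipna=True)
print(row_means[0])  # (-0.5 + 0.0 + 0.8) / 3, i.e. about 0.1
```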
In [132]:
print(sentences_vader_means)
0 0.000712 dtype: float64
We can see that once the individual sentence scores are averaged together, the article as a whole receives a sentiment score just barely above zero. In other words - especially considering the raw sentence scores - the article is about as positive as it is negative.¶
Based on the code above, the next cell contains the functions that should make up my main.py file.¶
Note how I have designed the code below to enable users to upload their own text files to be scored using the "/predict" endpoint.¶
In [133]:
from contextlib import asynccontextmanager
from typing import List, Optional
import numpy as np
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from pydantic import BaseModel
from nltk.tokenize.punkt import PunktSentenceTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Build tokenizer
def build_tokenizer() -> PunktSentenceTokenizer:
tokenizer = PunktSentenceTokenizer()
extra_abbrevs = {
# titles / honorifics
"mr", "prof", "sr", "jr", "rev", "hon", "gov",
"pres", "gen", "col", "lt", "cmdr", "capt", "sgt", "dr", "sen", "rep", "ms", "mrs",
# academic / organizational
"ph.d", "m.d", "b.a", "m.a", "inc", "ltd", "co", "corp",
# date / time
"jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec",
# address / locations
"apt", "dept", "no", "mt", "ft", "hwy", "ave", "blvd", "rd", "st",
# places
"u.s", "u.s.a", "d.c", "u.k", "e.u", "u.a.e",
# Latin / shorthand
"vs", "al", "fig", "eq", "est", "cf", "approx", "e.g", "i.e", "etc", "p.s",
# days
"mon", "tue", "tues", "wed", "thu", "thur", "thurs", "fri", "sat", "sun",
}
tokenizer._params.abbrev_types.update(extra_abbrevs)
return tokenizer
# Load shared objects once at startup
@asynccontextmanager
async def lifespan(app: FastAPI):
app.state.tokenizer = build_tokenizer()
app.state.analyzer = SentimentIntensityAnalyzer()
yield
app = FastAPI(
title="Sentiment Scoring API",
description="Upload a .txt file and score its sentiment using VADER.",
version="1.0.0",
lifespan=lifespan,
)
# Response model
class SentimentResponse(BaseModel):
filename: str
sentence_count: int
sentence_scores: List[float]
overall_mean_vader_score: float
# Core scoring function
def score_text_document(
text: str,
tokenizer: PunktSentenceTokenizer,
analyzer: SentimentIntensityAnalyzer,
) -> dict:
"""
Workflow:
- sentence tokenize
- VADER compound score per sentence
- compute mean of all sentence scores, including 0.0 neutral scores
"""
if not text or not text.strip():
raise ValueError("Submitted text is empty.")
sentences = tokenizer.sentences_from_text(text)
if not sentences:
raise ValueError("No sentences could be extracted from the text.")
scores = []
for sentence in sentences:
if sentence is not None:
compound = analyzer.polarity_scores(sentence)["compound"]
scores.append(compound)
if not scores:
raise ValueError("No sentiment scores could be computed.")
scores_array = np.array(scores, dtype=float)
overall_mean_vader_score = float(np.mean(scores_array))
sentence_scores = [float(score) for score in scores_array]
return {
"sentence_count": len(sentences),
"sentence_scores": sentence_scores,
"overall_mean_vader_score": overall_mean_vader_score,
}
# Health check
@app.get("/")
def read_root():
return {"message": "Sentiment Scoring API is running."}
# FastAPI endpoint for uploaded .txt
@app.post("/predict", response_model=SentimentResponse)
async def predict(file: UploadFile = File(...)):
if not file.filename:
raise HTTPException(status_code=400, detail="A file must be provided.")
if not file.filename.lower().endswith(".txt"):
raise HTTPException(status_code=400, detail="Only .txt files are allowed.")
try:
raw_bytes = await file.read()
text = raw_bytes.decode("utf-8")
except UnicodeDecodeError:
raise HTTPException(
status_code=400,
detail="File could not be decoded as UTF-8 text.",
)
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Unexpected error while reading file: {str(e)}",
)
try:
result = score_text_document(
text=text,
tokenizer=app.state.tokenizer,
analyzer=app.state.analyzer,
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Unexpected error while scoring text: {str(e)}",
)
return SentimentResponse(
filename=file.filename,
sentence_count=result["sentence_count"],
sentence_scores=result["sentence_scores"],
overall_mean_vader_score=result["overall_mean_vader_score"],
)
# Run locally
if __name__ == "__main__":
uvicorn.run(
"main:app",
host="0.0.0.0",
port=8080,
reload=True,
)
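Once the server is running (e.g., via python main.py), a user could score a text file from the command line. A sketch, assuming the server is reachable at localhost:8080 and article.txt is a local UTF-8 file:

```shell
# POST a .txt file as multipart/form-data to the /predict endpoint
curl -X POST "http://localhost:8080/predict" \
  -F "file=@article.txt;type=text/plain"
```

The response is the JSON body defined by SentimentResponse: the filename, the sentence count, the per-sentence compound scores, and the overall mean VADER score.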