# virgil-ai/src/drug_price_parser.py

from typing import List
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel


class Dosage(BaseModel):
    """One dosage entry of a formulation together with its parsed price."""

    # Dosage label; the parser takes it from the bold text inside the first
    # <span> of the entry's <summary> element.
    name: str
    # Price parsed from the first right-aligned cell of the entry's table,
    # with "$" and "," removed. Presumably a per-unit price in USD -- confirm
    # against the page layout.
    price: float


class Formulation(BaseModel):
    """A drug formulation (one <h3> heading on the price page) and its dosages."""

    # Text of the <h3> heading, with trailing whitespace removed by the parser.
    name: str
    # Dosage entries found in the <div> that follows the heading.
    dosages: List[Dosage]


class DrugPriceResponse(BaseModel):
    """Result returned by DrugPriceParser.get_drug_prices for one drug."""

    # Drug name exactly as the caller supplied it.
    drug_name: str
    # Price-guide page URL that was requested for this drug.
    url: str
    # Formulations (each with its dosages) extracted from the page.
    prices: List[Formulation]
    # Outcome marker; get_drug_prices sets it to "success".
    status: str


class DrugPriceParser:
    """Parser for extracting drug pricing information from Drugs.com.

    The parser owns a ``requests.Session`` (``self.session``) that is reused
    for every request. Call :meth:`close` when done, or use the parser as a
    context manager, to release the pooled connections.
    """

    BASE_URL = "https://www.drugs.com/price-guide"

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 15

    # Browser-like headers sent with every request.
    #
    # ``Accept-Encoding`` is deliberately NOT overridden: requests already
    # advertises exactly the encodings it is able to decode. Forcing
    # "br"/"zstd" makes the server free to answer with a body that stays
    # compressed when the optional brotli/zstandard packages are missing,
    # which would hand undecodable bytes to the HTML parser.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Sec-GPC': '1',
        'Connection': 'keep-alive',
        'Cookie': 'ddc-pvc=8; ddcsubscribe=disabled',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Priority': 'u=0, i',
        'TE': 'trailers',
    }

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)

    def close(self) -> None:
        """Close the underlying HTTP session and its pooled connections."""
        self.session.close()

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the session when the ``with`` block ends."""
        self.close()

    def get_drug_prices(self, drug_name: str) -> DrugPriceResponse:
        """
        Get pricing information for a specific drug.

        Args:
            drug_name: Name of the drug (e.g., 'alprazolam')

        Returns:
            DrugPriceResponse containing pricing information

        Raises:
            ValueError: If ``drug_name`` is empty or only whitespace.
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: On timeouts or connection failures.
        """
        # Escape the name so characters such as "/", "?" or "#" cannot alter
        # the path of the requested URL.
        slug = quote(drug_name.strip().lower(), safe="")
        if not slug:
            raise ValueError("drug_name must not be empty")
        url = f"{self.BASE_URL}/{slug}#prices"

        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on error pages instead of parsing them as price data.
        response.raise_for_status()

        # Pass raw bytes so BeautifulSoup performs its own encoding detection.
        soup = BeautifulSoup(response.content, 'html.parser')
        prices_data = self._extract_prices(soup, drug_name)

        return DrugPriceResponse(
            drug_name=drug_name,
            url=url,
            prices=prices_data,
            status="success"
        )

    def _extract_prices(self, soup: BeautifulSoup, drug_name: str) -> List[Formulation]:
        """Extract pricing information from the parsed HTML.

        Args:
            soup: Parsed price-guide page.
            drug_name: Currently unused; kept for interface compatibility.

        Returns:
            One Formulation per class-less <h3> inside ``div#content``, in
            document order. An empty list when the page has no such container.
        """
        formulations = []
        content = soup.find('div', {'id': 'content'})
        if content is None:
            # Unexpected layout: report "no prices" rather than crash.
            return formulations

        for heading in content.find_all('h3'):
            # The first <h3> carrying a class attribute ends the price section.
            if heading.get('class'):
                break

            dosages = []
            container = heading.find_next('div')
            if container is not None:
                for entry in container.find_all('details'):
                    dosage = self._parse_dosage(entry)
                    if dosage is not None:
                        dosages.append(dosage)

            formulations.append(Formulation(
                name=heading.get_text().rstrip(),
                dosages=dosages
            ))
        return formulations

    def _parse_dosage(self, entry):
        """Build a Dosage from one <details> element.

        Returns None when the element lacks the expected table, price cell,
        summary label, or when the price text is not numeric, so a single
        malformed entry does not abort the whole page.
        """
        table = entry.find('table')
        summary = entry.find('summary')
        if table is None or summary is None:
            return None

        # First right-aligned cell of the table holds the price text.
        price_cell = table.find('td', {'class': 'ddc-text-right'})
        # The dosage label is the bold text of the first <span> in the summary.
        label_span = summary.find('span')
        label = label_span.find('b') if label_span is not None else None
        if price_cell is None or label is None:
            return None

        try:
            price = self._parse_price(price_cell.get_text())
        except ValueError:
            return None
        return Dosage(name=label.get_text().rstrip(), price=price)

    @staticmethod
    def _parse_price(text: str) -> float:
        """Convert price text such as ``"$1,234.56 per unit"`` to a float.

        Only the first whitespace-separated token is used; "$" and thousands
        separators are removed before conversion.

        Raises:
            ValueError: If the text is empty or its first token is not numeric.
        """
        # split() without arguments ignores leading whitespace and newlines,
        # which split(" ") would turn into an empty first token.
        tokens = text.split()
        if not tokens:
            raise ValueError("empty price text")
        return float(tokens[0].replace('$', '').replace(',', ''))


def parse_drug_prices(drug_name: str) -> str:
    """Fetch the prices for ``drug_name`` and return them as indented JSON.

    Args:
        drug_name: Name of the drug (e.g., 'alprazolam')

    Returns:
        The DrugPriceResponse serialised with ``model_dump_json(indent=2)``.

    Exceptions raised by ``DrugPriceParser.get_drug_prices`` propagate
    unchanged.
    """
    parser = DrugPriceParser()
    try:
        result = parser.get_drug_prices(drug_name)
    finally:
        # The parser is single-use here; release its pooled connections even
        # when the request or the parsing fails.
        parser.session.close()
    return result.model_dump_json(indent=2)


if __name__ == "__main__":
    import sys

    # Command-line entry point: the drug name is the first argument.
    if len(sys.argv) > 1:
        print(parse_drug_prices(sys.argv[1]))
    else:
        # Usage errors go to stderr with a non-zero exit status so shell
        # pipelines and scripts can detect the misuse.
        print("Usage: python drug_price_parser.py <drug_name>", file=sys.stderr)
        print("Example: python drug_price_parser.py alprazolam", file=sys.stderr)
        sys.exit(2)