virgil-ai/src/drug_price_parser.py

125 lines
No EOL
4.1 KiB
Python

from typing import List, Optional
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel
class Dosage(BaseModel):
    """One dosage entry listed under a drug formulation."""

    # Label of the dosage as shown on the page.
    name: str
    # Price parsed from the page as a float (currency symbol and thousands
    # separators removed by the parser before conversion).
    price: float
class Formulation(BaseModel):
    """A named drug formulation together with the dosages listed for it."""

    # Heading text of the formulation section.
    name: str
    # Dosage entries found under that heading; may be empty.
    dosages: List[Dosage]
class DrugPriceResponse(BaseModel):
    """Result envelope returned by ``DrugPriceParser.get_drug_prices``."""

    # Drug name exactly as supplied by the caller.
    drug_name: str
    # URL that was requested to obtain the prices.
    url: str
    # Formulations (each with its dosages) extracted from the page.
    prices: List[Formulation]
    # Outcome marker; the parser in this file sets it to "success".
    status: str
class DrugPriceParser:
    """Parser for extracting drug pricing information from Drugs.com.

    A single ``requests.Session`` with browser-like headers is created per
    instance and reused for every lookup.
    """

    BASE_URL = "https://www.drugs.com/price-guide"
    # Default number of seconds to wait for the HTTP response.
    DEFAULT_TIMEOUT = 15

    def __init__(self, timeout: float = DEFAULT_TIMEOUT):
        """
        Create the HTTP session used for all requests.

        Args:
            timeout: Seconds to wait for a response before ``requests``
                raises a timeout error. Defaults to 15.
        """
        self.timeout = timeout
        self.session = requests.Session()
        # NOTE: 'Accept-Encoding' is deliberately NOT overridden here.
        # requests already advertises exactly the encodings it is able to
        # decode in the current environment; forcing 'br'/'zstd' without the
        # optional decoder packages installed yields an undecodable body.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Sec-GPC': '1',
            'Connection': 'keep-alive',
            # NOTE(review): fixed cookie values copied from a browser
            # session, presumably to suppress a subscribe prompt - confirm.
            'Cookie': 'ddc-pvc=8; ddcsubscribe=disabled',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Priority': 'u=0, i',
            'TE': 'trailers',
        })

    def get_drug_prices(self, drug_name: str) -> DrugPriceResponse:
        """
        Get pricing information for a specific drug.

        Args:
            drug_name: Name of the drug (e.g., 'alprazolam')

        Returns:
            DrugPriceResponse containing pricing information. ``prices`` is
            empty when the page contains no recognisable price sections.

        Raises:
            ValueError: If ``drug_name`` is empty or only whitespace.
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: On timeouts and connection failures.
        """
        slug = drug_name.strip().lower()
        if not slug:
            raise ValueError("drug_name must be a non-empty string")

        # Percent-encode the name so characters such as '/', '?' or spaces
        # cannot alter the request path or inject a query string.
        url = f"{self.BASE_URL}/{quote(slug, safe='')}#prices"

        response = self.session.get(url, timeout=self.timeout)
        # Fail loudly on error pages instead of parsing them as price data.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        return DrugPriceResponse(
            drug_name=drug_name,
            url=url,
            prices=self._extract_prices(soup, drug_name),
            status="success",
        )

    def _extract_prices(self, soup: BeautifulSoup, drug_name: str) -> List[Formulation]:
        """
        Extract pricing information from the parsed HTML.

        Args:
            soup: Parsed document of a price-guide page.
            drug_name: Unused; kept so the signature stays unchanged.

        Returns:
            One Formulation per leading class-less ``<h3>`` inside
            ``div#content``, in document order. Empty when that container
            is missing.
        """
        content = soup.find('div', {'id': 'content'})
        if content is None:
            return []

        formulations = []
        for heading in content.find_all('h3'):
            # The first <h3> carrying a class attribute ends the scan
            # (presumably it starts a non-price section - confirm).
            if heading.get('class'):
                break

            # Dosages are looked up in the first <div> following the heading.
            container = heading.find_next('div')
            entries = container.find_all('details') if container is not None else []

            dosages = []
            for entry in entries:
                dosage = self._parse_dosage(entry)
                # Entries that do not match the expected markup are skipped
                # rather than aborting the whole page.
                if dosage is not None:
                    dosages.append(dosage)

            formulations.append(Formulation(
                name=heading.get_text().rstrip(),
                dosages=dosages,
            ))
        return formulations

    @staticmethod
    def _parse_dosage(details) -> Optional[Dosage]:
        """Build a Dosage from one ``<details>`` element, or None if it is malformed."""
        table = details.find('table')
        summary = details.find('summary')
        if table is None or summary is None:
            return None

        price_cells = table.find_all('td', {'class': 'ddc-text-right'})
        spans = summary.find_all('span')
        if not price_cells or not spans:
            return None

        # The dosage label is the bold text inside the summary's first span.
        name_tag = spans[0].find('b')
        if name_tag is None:
            return None

        # Only the first whitespace-separated token of the first
        # right-aligned cell is the amount (e.g. '$1,234.56').
        tokens = price_cells[0].get_text().split()
        if not tokens:
            return None
        try:
            price = float(tokens[0].replace('$', '').replace(',', ''))
        except ValueError:
            return None

        return Dosage(name=name_tag.get_text().rstrip(), price=price)
def parse_drug_prices(drug_name: str) -> str:
    """Look up prices for ``drug_name`` and return them as a JSON string.

    A fresh ``DrugPriceParser`` is created for the call; the resulting
    response model is serialised with two-space indentation.
    """
    response = DrugPriceParser().get_drug_prices(drug_name)
    return response.model_dump_json(indent=2)
if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument is the drug name; any
    # further arguments are ignored.
    if len(sys.argv) > 1:
        print(parse_drug_prices(sys.argv[1]))
    else:
        # Usage errors go to stderr with a non-zero exit status so that
        # shell callers and pipelines can detect the failure and stdout
        # stays reserved for the JSON result.
        print("Usage: python drug_price_parser.py <drug_name>", file=sys.stderr)
        print("Example: python drug_price_parser.py alprazolam", file=sys.stderr)
        sys.exit(2)