add drug parser; add iha estimation rules

This commit is contained in:
ipu 2025-07-31 22:08:02 +03:00
parent bf1d988d36
commit 80916f6c3e
10 changed files with 1271 additions and 15 deletions

116
src/drug_price_parser.py Normal file
View file

@ -0,0 +1,116 @@
"""
Drug Price Parser for Drugs.com
This module provides functionality to scrape drug pricing information
from Drugs.com and return it in JSON format.
"""
import json
import re
import time
from typing import Dict, List, Optional, Any
from urllib.parse import quote, quote_plus

import requests
from bs4 import BeautifulSoup
class DrugPriceParser:
    """Parser for extracting drug pricing information from Drugs.com.

    Each instance owns one ``requests.Session`` configured with browser-like
    headers; that session is reused for every lookup made through the
    instance.
    """

    BASE_URL = "https://www.drugs.com/price-guide"

    def __init__(self):
        """Create the HTTP session and install the browser-like headers."""
        self.session = requests.Session()
        # NOTE(review): 'br' and 'zstd' are advertised in Accept-Encoding;
        # requests/urllib3 appear to decode those only when the optional
        # brotli / zstandard packages are installed -- confirm they are
        # present in the deployment environment.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Sec-GPC': '1',
            'Connection': 'keep-alive',
            'Cookie': 'ddc-pvc=8; ddcsubscribe=disabled',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Priority': 'u=0, i',
            'TE': 'trailers'
        })

    @classmethod
    def _build_url(cls, drug_name: str) -> str:
        """Return the price-guide URL for *drug_name*.

        The name is trimmed, lower-cased and percent-encoded so characters
        such as ``/``, ``?`` or ``#`` cannot alter the request path.

        Raises:
            ValueError: If the name is empty or only whitespace.
        """
        slug = drug_name.strip().lower()
        if not slug:
            raise ValueError("drug_name must be a non-empty string")
        return f"{cls.BASE_URL}/{quote(slug, safe='')}#prices"

    @staticmethod
    def _parse_price(text: str) -> float:
        """Convert a price label such as ``'$1,234.56'`` to a float.

        Raises:
            ValueError: If the text is not a number once ``$`` and ``,`` are
                removed.
        """
        return float(text.strip().replace('$', '').replace(',', ''))

    def get_drug_prices(self, drug_name: str) -> Dict[str, Any]:
        """
        Get pricing information for a specific drug.

        Args:
            drug_name: Name of the drug (e.g., 'alprazolam')

        Returns:
            Dictionary with the keys ``drug_name``, ``url``, ``prices`` and
            ``status``; it is JSON-serialisable.

        Raises:
            ValueError: If *drug_name* is blank or the page has no
                ``div#content`` element.
            requests.RequestException: On connection problems, timeouts or a
                4xx/5xx response.
        """
        url = self._build_url(drug_name)
        response = self.session.get(url, timeout=15)
        # Without this an error page would be parsed and reported as success.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return {
            "drug_name": drug_name,
            "url": url,
            "prices": self._extract_prices(soup, drug_name),
            "status": "success"
        }

    def _parse_dosage(self, entry) -> Optional[Dict[str, Any]]:
        """Turn one ``<details>`` element into ``{"name", "price"}``.

        Returns ``None`` when the element lacks the expected
        ``summary > span > b`` pieces or its price text is not numeric, so a
        single odd row does not abort the whole page.
        """
        summary = entry.find('summary')
        if summary is None:
            return None
        spans = summary.find_all('span')
        if len(spans) < 2:
            return None
        name_tag = spans[0].find('b')
        # find_next (not find) is kept from the original lookup: it returns
        # the first <b> that follows the second span's opening tag.
        price_tag = spans[1].find_next('b')
        if name_tag is None or price_tag is None:
            return None
        try:
            price = self._parse_price(price_tag.get_text())
        except ValueError:
            return None
        return {"name": name_tag.get_text().strip(), "price": price}

    def _extract_prices(self, soup: "BeautifulSoup", drug_name: str) -> Dict[str, Any]:
        """Extract pricing information from the parsed HTML.

        Args:
            soup: Parsed price-guide page.
            drug_name: Only used in the error message.

        Returns:
            ``{"formulations": [{"name": str, "dosages": [...]}, ...]}``.

        Raises:
            ValueError: If the page has no ``div`` with id ``content``.
        """
        content = soup.find('div', {'id': 'content'})
        if content is None:
            raise ValueError(
                f"No price content found on the page for {drug_name!r}"
            )

        formulations = []
        for heading in content.find_all('h3'):
            # Scanning stops at the first <h3> that carries a class
            # attribute; only the class-less headings before it are read as
            # formulation names.
            if heading.get('class'):
                break
            container = heading.find_next('div')
            entries = container.find_all('details') if container is not None else []
            dosages = []
            for entry in entries:
                dosage = self._parse_dosage(entry)
                if dosage is not None:
                    dosages.append(dosage)
            formulations.append({
                "name": heading.get_text().strip(),
                "dosages": dosages
            })
        return {"formulations": formulations}
def parse_drug_prices(drug_name: str) -> str:
    """Fetch prices for *drug_name* and return them as a JSON string.

    A fresh ``DrugPriceParser`` is created for the call and its HTTP session
    is closed afterwards, whether or not the lookup succeeds.

    Args:
        drug_name: Name of the drug (e.g., 'alprazolam').

    Returns:
        The dictionary produced by ``DrugPriceParser.get_drug_prices``,
        serialised with ``json.dumps(..., indent=2)``.

    Any exception raised by ``get_drug_prices`` propagates to the caller.
    """
    parser = DrugPriceParser()
    try:
        result = parser.get_drug_prices(drug_name)
    finally:
        # The parser is single-use here, so release its pooled connections
        # instead of leaving them for the garbage collector.
        parser.session.close()
    return json.dumps(result, indent=2)
if __name__ == "__main__":
    import sys

    # Command-line entry point: the first positional argument is the drug
    # name; with no argument a short usage message is printed instead.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python drug_price_parser.py <drug_name>")
        print("Example: python drug_price_parser.py alprazolam")
    else:
        print(parse_drug_prices(cli_args[0]))