add drug parser; add iha estimation rules
This commit is contained in:
parent
bf1d988d36
commit
80916f6c3e
10 changed files with 1271 additions and 15 deletions
116
src/drug_price_parser.py
Normal file
116
src/drug_price_parser.py
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
"""
|
||||
Drug Price Parser for Drugs.com
|
||||
|
||||
This module provides functionality to scrape drug pricing information
|
||||
from Drugs.com and return it in JSON format.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, List, Optional, Any
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class DrugPriceParser:
    """Scrape per-formulation drug prices from the Drugs.com price guide.

    Each instance owns one ``requests.Session`` that carries browser-like
    headers and is reused for every lookup made through that instance.
    """

    BASE_URL = "https://www.drugs.com/price-guide"
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0'
    REQUEST_TIMEOUT = 15  # seconds, passed to every GET

    def __init__(self):
        """Create the HTTP session and install the browser-like headers."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self.USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            # Only advertise encodings requests can always decode. Brotli and
            # zstd need optional packages; without them the body would reach
            # BeautifulSoup still compressed.
            'Accept-Encoding': 'gzip, deflate',
            'Sec-GPC': '1',
            'Connection': 'keep-alive',
            # NOTE(review): hard-coded cookie values; their meaning is not
            # visible from this file -- confirm they are still required.
            'Cookie': 'ddc-pvc=8; ddcsubscribe=disabled',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Priority': 'u=0, i',
            'TE': 'trailers'
        })

    def get_drug_prices(self, drug_name: str) -> Dict[str, Any]:
        """
        Get pricing information for a specific drug.

        Args:
            drug_name: Name of the drug (e.g., 'alprazolam')

        Returns:
            Dictionary with the keys ``drug_name``, ``url``, ``prices`` and
            ``status``; it is JSON-serialisable.

        Raises:
            ValueError: If ``drug_name`` is empty or only whitespace.
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: For connection errors and timeouts.
        """
        from urllib.parse import quote

        # Percent-encode the name so characters such as '/', '?' or '#'
        # cannot change which page is requested.
        slug = quote(drug_name.strip().lower(), safe='')
        if not slug:
            raise ValueError("drug_name must not be empty")
        url = f"{self.BASE_URL}/{slug}#prices"

        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        # Without this check an error page would be parsed and reported
        # as a successful lookup.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        prices_data = self._extract_prices(soup, drug_name)

        return {
            "drug_name": drug_name,
            "url": url,
            "prices": prices_data,
            "status": "success"
        }

    def _extract_prices(self, soup: "BeautifulSoup", drug_name: str) -> Dict[str, Any]:
        """Extract pricing information from the parsed HTML.

        Args:
            soup: Parsed price-guide page.
            drug_name: Currently unused; kept so the signature stays stable.

        Returns:
            ``{"formulations": [...]}`` where every formulation has a
            ``name`` and a list of ``dosages`` (``name`` / ``price`` pairs).
            The list is empty when the page has no ``div#content``.
        """
        prices_data: Dict[str, Any] = {"formulations": []}

        div_content = soup.find('div', {'id': 'content'})
        if div_content is None:
            return prices_data

        for formulation in div_content.find_all('h3'):
            # Stop at the first <h3> that carries a class attribute;
            # presumably those headings belong to sections after the price
            # tables -- TODO confirm against the live page.
            if formulation.get('class'):
                break

            dosages_table = formulation.find_next('div')
            details = dosages_table.find_all('details') if dosages_table is not None else []

            dosages = []
            for dosage in details:
                entry = self._parse_dosage(dosage)
                # Malformed rows are skipped instead of aborting the scrape.
                if entry is not None:
                    dosages.append(entry)

            prices_data["formulations"].append({
                "name": formulation.get_text().rstrip(),
                "dosages": dosages
            })

        return prices_data

    @staticmethod
    def _parse_dosage(dosage) -> Optional[Dict[str, Any]]:
        """Turn one ``<details>`` element into a name/price dict, or None if malformed."""
        summary = dosage.find('summary')
        if summary is None:
            return None

        spans = summary.find_all('span')
        if len(spans) < 2:
            return None

        # The first span holds the dosage label; the price is the first <b>
        # found after the second span.
        name_tag = spans[0].find('b')
        price_tag = spans[1].find_next('b')
        if name_tag is None or price_tag is None:
            return None

        price = DrugPriceParser._parse_price(price_tag.get_text())
        if price is None:
            return None

        return {
            "name": name_tag.get_text().rstrip(),
            "price": price
        }

    @staticmethod
    def _parse_price(text: str) -> Optional[float]:
        """Convert text such as ``'$1,234.50'`` to a float, or None if not numeric."""
        cleaned = text.strip().replace('$', '').replace(',', '')
        try:
            return float(cleaned)
        except ValueError:
            return None
|
||||
|
||||
|
||||
def parse_drug_prices(drug_name: str) -> str:
    """Fetch the prices for *drug_name* and return them as a JSON string.

    Args:
        drug_name: Name of the drug (e.g., 'alprazolam').

    Returns:
        The dictionary produced by ``DrugPriceParser.get_drug_prices``,
        serialised with ``json.dumps`` using a two-space indent.
    """
    parser = DrugPriceParser()
    try:
        result = parser.get_drug_prices(drug_name)
    finally:
        # The parser is a throwaway here, so release the pooled connections
        # held by its session even when the lookup raises.
        parser.session.close()
    return json.dumps(result, indent=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument is the drug name; any
    # further arguments are ignored.
    if len(sys.argv) < 2:
        # A missing argument is a usage error: report it on stderr and exit
        # non-zero so shell callers can detect the failure.
        print("Usage: python drug_price_parser.py <drug_name>", file=sys.stderr)
        print("Example: python drug_price_parser.py alprazolam", file=sys.stderr)
        sys.exit(2)

    print(parse_drug_prices(sys.argv[1]))
|
||||
Loading…
Add table
Add a link
Reference in a new issue