| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | import subprocess |
| | import sys |
| |
|
def install_packages():
    """Install required third-party packages that are not already importable.

    Uses the current interpreter (``sys.executable -m pip``) so the install
    lands in the active environment. Distribution names that differ from
    their import names are mapped explicitly: without the mapping,
    ``__import__("python-dotenv")`` always raises ImportError (a hyphen is
    not legal in a module name), so that package was reinstalled on every
    single run.
    """
    # pip distribution name -> importable module name
    packages = {
        'openai': 'openai',
        'gradio': 'gradio',
        'python-dotenv': 'dotenv',
        'requests': 'requests',
        'pandas': 'pandas',
    }
    for package, module_name in packages.items():
        try:
            __import__(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
| |
|
| | |
# Bootstrap step: ensure all third-party dependencies are installed before
# the imports below execute (no-op when everything is already present).
install_packages()
| |
|
| | |
| | import gradio as gr |
| | import json |
| | import random |
| | import re |
| | import time |
| | import os |
| | import io |
| | import zipfile |
| | from datetime import datetime |
| | from typing import Dict, List, Any, Optional, Tuple |
| | from openai import OpenAI |
| | import pandas as pd |
| |
|
| | |
| | |
| | |
| |
|
class MedicalLiteratureSimulator:
    """Simulates medical literature research for health dataset generation.

    Holds a static catalogue of research domains (interventions, biomarkers,
    outcomes) and fabricates plausible-looking synthetic study records from
    it. All output is synthetic: PMIDs, cohorts and findings are random.
    """

    def __init__(self):
        # Catalogue of domain -> interventions / biomarkers / outcomes used
        # to assemble synthetic studies.
        self.research_domains = {
            "longevity": {
                "interventions": ["NAD+ supplementation", "resveratrol", "metformin", "caloric restriction"],
                "biomarkers": ["telomere length", "cellular senescence", "inflammatory markers", "mitochondrial function"],
                "outcomes": ["biological age reduction", "improved healthspan", "enhanced cellular repair"]
            },
            "metabolic_health": {
                "interventions": ["berberine", "intermittent fasting", "alpha-lipoic acid", "chromium"],
                "biomarkers": ["glucose levels", "insulin sensitivity", "HbA1c", "HOMA-IR"],
                "outcomes": ["improved glucose control", "enhanced insulin sensitivity", "reduced inflammation"]
            },
            "cardiovascular": {
                "interventions": ["omega-3 fatty acids", "coenzyme Q10", "magnesium", "nattokinase"],
                "biomarkers": ["blood pressure", "cholesterol levels", "CRP", "endothelial function"],
                "outcomes": ["reduced blood pressure", "improved lipid profile", "decreased inflammation"]
            },
            "cognitive": {
                "interventions": ["lion's mane mushroom", "phosphatidylserine", "bacopa monnieri", "acetyl-L-carnitine"],
                "biomarkers": ["cognitive performance", "BDNF levels", "neuroinflammation", "memory function"],
                "outcomes": ["enhanced memory", "improved cognitive function", "neuroprotection"]
            },
            "hormonal": {
                "interventions": ["ashwagandha", "vitamin D", "DHEA", "maca root"],
                "biomarkers": ["cortisol levels", "thyroid hormones", "sex hormones", "stress markers"],
                "outcomes": ["hormone balance", "improved energy", "better sleep quality"]
            },
            "inflammation": {
                "interventions": ["curcumin", "omega-3", "quercetin", "boswellia"],
                "biomarkers": ["CRP", "IL-6", "TNF-alpha", "oxidative stress"],
                "outcomes": ["reduced inflammation", "improved immune function", "enhanced recovery"]
            }
        }

    def generate_study_data(self, domain: str) -> Dict[str, Any]:
        """Generate a realistic synthetic medical study record.

        Args:
            domain: Key into ``research_domains``; unknown values fall back
                to ``"longevity"``.

        Returns:
            Dict with study metadata (pmid, title, abstract, journal, year),
            the sampled interventions/biomarkers/outcomes, cohort size,
            duration, and dosages matching the study's first intervention.
        """
        if domain not in self.research_domains:
            domain = "longevity"

        domain_data = self.research_domains[domain]

        # Sample the interventions first so the dosages below refer to an
        # intervention that is actually part of this study. (Bug fix: the
        # dosages were previously always derived from the domain's first
        # catalogue entry even when that intervention was not selected.)
        interventions = random.sample(domain_data["interventions"], min(2, len(domain_data["interventions"])))

        study = {
            "pmid": f"PMID{random.randint(35000000, 40000000)}",
            "title": self._generate_study_title(domain, domain_data),
            "abstract": self._generate_study_abstract(domain, domain_data),
            "journal": random.choice([
                "Nature Medicine", "Cell Metabolism", "Journal of Clinical Medicine",
                "Circulation", "Aging Cell", "Nutrients", "Clinical Nutrition"
            ]),
            "year": random.choice([2023, 2024]),
            "domain": domain,
            "interventions": interventions,
            "biomarkers": random.sample(domain_data["biomarkers"], min(3, len(domain_data["biomarkers"]))),
            "outcomes": random.sample(domain_data["outcomes"], min(2, len(domain_data["outcomes"]))),
            "participant_count": random.randint(50, 300),
            "duration_weeks": random.choice([8, 12, 16, 24]),
            "dosages": self._generate_dosages(interventions[0])
        }

        return study

    def _generate_study_title(self, domain: str, domain_data: Dict) -> str:
        """Pick one of several RCT-style title templates at random."""
        intervention = random.choice(domain_data["interventions"])
        outcome = random.choice(domain_data["outcomes"])

        titles = [
            f"Effects of {intervention} on {outcome}: A randomized controlled trial",
            f"{intervention} supplementation improves {outcome} in healthy adults",
            f"Clinical evaluation of {intervention} for {outcome} optimization",
            f"Randomized trial of {intervention} in {outcome} enhancement"
        ]

        return random.choice(titles)

    def _generate_study_abstract(self, domain: str, domain_data: Dict) -> str:
        """Compose a structured (Background/Methods/Results) synthetic abstract."""
        intervention = domain_data["interventions"][0]
        biomarker = random.choice(domain_data["biomarkers"])
        outcome = random.choice(domain_data["outcomes"])

        abstract = f"""
Background: {intervention} has shown promise in preliminary studies for health optimization.

Objective: To evaluate the effects of {intervention} supplementation on {biomarker} and related health outcomes.

Methods: Randomized, double-blind, placebo-controlled trial with {random.randint(120, 250)} participants aged 40-65 years.
Subjects received {intervention} or placebo for {random.randint(12, 24)} weeks.

Results: {intervention} supplementation significantly improved {outcome} compared to placebo (p<0.05).
{biomarker.capitalize()} showed {random.randint(15, 35)}% improvement from baseline.
Secondary outcomes included improved quality of life and no serious adverse events.

Conclusions: {intervention} supplementation provides significant benefits for {outcome} with excellent safety profile.
""".strip()

        return abstract

    def _generate_dosages(self, intervention: str) -> List[str]:
        """Return up to two plausible dosages for *intervention*.

        Matching is a case-insensitive substring test against known compound
        names; unrecognized interventions get a generic default.
        """
        dosage_ranges = {
            "NAD+": ["250mg", "500mg", "1000mg"],
            "resveratrol": ["100mg", "250mg", "500mg"],
            "berberine": ["500mg", "1000mg", "1500mg"],
            "omega-3": ["1000mg", "2000mg", "3000mg"],
            "magnesium": ["200mg", "400mg", "600mg"],
            "curcumin": ["500mg", "1000mg", "1500mg"]
        }

        for key in dosage_ranges:
            if key.lower() in intervention.lower():
                return random.sample(dosage_ranges[key], min(2, len(dosage_ranges[key])))

        return ["500mg", "1000mg"]
| |
|
class HealthProfileGenerator:
    """Generates realistic health profiles based on medical studies.

    Each profile bundles synthetic lab panels, microbiome / epigenetic /
    wearable / CGM summaries and a first-person user query, all scaled by a
    named severity level.
    """

    def __init__(self):
        # Each severity carries a biomarker multiplier plus a prose summary.
        self.severity_levels = {
            "optimal": {"multiplier": 1.0, "description": "excellent baseline health with optimization focus"},
            "mild": {"multiplier": 1.2, "description": "minor health concerns with good overall function"},
            "moderate": {"multiplier": 1.5, "description": "noticeable health issues requiring intervention"},
            "severe": {"multiplier": 2.0, "description": "significant health challenges needing intensive protocols"},
        }

    def generate_profile_from_study(self, study: Dict[str, Any], severity: str = "moderate") -> Dict[str, Any]:
        """Build a complete health profile matching the study's domain and severity."""
        domain = study.get("domain", "longevity")
        # Unknown severity names degrade gracefully to the "moderate" multiplier.
        factor = self.severity_levels.get(severity, self.severity_levels["moderate"])["multiplier"]

        subject_age = random.randint(35, 65)
        subject_gender = random.choice(["male", "female"])

        test_results = {
            "Labs": self._generate_lab_values(domain, factor),
            "gut_microbiome": self._generate_gut_microbiome(severity),
            "epigenetics": self._generate_epigenetics(severity),
            "wearables": self._generate_wearables(severity),
            "cgm": self._generate_cgm(severity),
        }

        return {
            "user_tests_result_data": test_results,
            "user_query": self._generate_user_query(study, subject_age, subject_gender, severity),
            "source_study": {
                "pmid": study.get("pmid"),
                "domain": domain,
                "severity": severity,
                "title": study.get("title"),
            },
        }

    def _generate_lab_values(self, domain: str, multiplier: float) -> Dict[str, Any]:
        """Produce synthetic lab panels; values drift with the severity multiplier."""
        # Cardiovascular / lipid panel — most values scale up with severity,
        # while protective HDL scales down.
        blood_panel = {
            "systolic_bp": int(random.randint(120, 140) * multiplier),
            "diastolic_bp": int(random.randint(70, 90) * multiplier),
            "total_cholesterol": int(random.randint(180, 220) * multiplier),
            "ldl": int(random.randint(100, 140) * multiplier),
            "hdl": int(random.randint(40, 60) / multiplier),
            "triglycerides": int(random.randint(80, 150) * multiplier),
            "apoB": int(random.randint(70, 110) * multiplier),
            "lp_a": random.randint(10, 50),
        }

        inflammation_panel = {
            "hscrp": round(random.uniform(1.0, 4.0) * multiplier, 1),
            "esr": int(random.randint(5, 25) * multiplier),
            "il6": round(random.uniform(1.0, 5.0) * multiplier, 1),
            "tnf_alpha": round(random.uniform(1.0, 3.0) * multiplier, 1),
            "oxidative_stress_markers": "elevated" if multiplier > 1.3 else "normal",
            "homocysteine": round(random.uniform(8, 15) * multiplier, 1),
        }

        # Nutrient status — stores deplete as severity rises.
        nutrient_panel = {
            "vitamin_d": int(random.randint(25, 50) / multiplier),
            "b12": random.randint(250, 400),
            "folate": round(random.uniform(6, 14), 1),
            "iron": random.randint(60, 120),
            "ferritin": random.randint(30, 100),
            "selenium": random.randint(80, 120),
            "zinc": random.randint(70, 110),
            "magnesium": round(random.uniform(1.5, 2.2), 1),
            "omega3_index": round(random.uniform(4, 8) / multiplier, 1),
        }

        labs = {
            "blood_tests": blood_panel,
            "inflammatory": inflammation_panel,
            "nutritional": nutrient_panel,
        }

        # A glucose-regulation panel only appears for the metabolic domain.
        if domain == "metabolic_health":
            labs["metabolic"] = {
                "fasting_glucose": int(random.randint(85, 110) * multiplier),
                "hba1c": round(random.uniform(5.2, 6.0) * min(multiplier, 1.4), 1),
                "insulin_fasting": round(random.uniform(5, 15) * multiplier, 1),
                "homa_ir": round(random.uniform(1.5, 4.0) * multiplier, 1),
            }

        return labs

    def _generate_gut_microbiome(self, severity: str) -> str:
        """One-line microbiome summary; diversity drops as severity rises."""
        score_windows = {
            "optimal": (8.5, 9.5),
            "mild": (7.0, 8.5),
            "moderate": (5.5, 7.0),
            "severe": (3.5, 5.5),
        }
        notes = {
            "optimal": "excellent diversity with optimal bacterial balance",
            "mild": "good diversity with minor imbalances",
            "moderate": "moderate dysbiosis with reduced beneficial bacteria",
            "severe": "significant dysbiosis with pathogenic overgrowth",
        }

        window = score_windows.get(severity)
        score = random.uniform(*window) if window else 6.5
        summary = notes.get(severity, "moderate dysbiosis")
        return f"Diversity score {score:.1f}/10, {summary}, beneficial bacteria {random.randint(60, 90)}%"

    def _generate_epigenetics(self, severity: str) -> str:
        """Summarize biological-age acceleration, telomere percentile and DunedinPACE."""
        acceleration_windows = {
            "optimal": (-2, 1),
            "mild": (1, 3),
            "moderate": (3, 6),
            "severe": (6, 12),
        }

        window = acceleration_windows.get(severity)
        acceleration = random.randint(*window) if window else 4
        # Faster aging pushes telomere length toward lower percentiles (floor 10).
        telomere_percentile = max(10, random.randint(30, 80) - acceleration * 5)

        return f"Biological age acceleration: {acceleration} years, telomere length: {telomere_percentile}th percentile, DunedinPACE: {round(random.uniform(0.9, 1.4), 2)}"

    def _generate_wearables(self, severity: str) -> Dict[str, int]:
        """Synthesize wearable metrics (HRV, resting HR, sleep, recovery, stress)."""
        severity_windows = {
            "optimal": {"hrv": (55, 75), "rhr": (45, 60), "sleep": (85, 95)},
            "mild": {"hrv": (45, 65), "rhr": (55, 70), "sleep": (75, 85)},
            "moderate": {"hrv": (30, 50), "rhr": (65, 80), "sleep": (60, 75)},
            "severe": {"hrv": (20, 35), "rhr": (75, 95), "sleep": (45, 65)},
        }

        windows = severity_windows.get(severity, severity_windows["moderate"])
        sleep_lo, sleep_hi = windows["sleep"]

        return {
            "hrv_avg": random.randint(*windows["hrv"]),
            "rhr": random.randint(*windows["rhr"]),
            "sleep_score": random.randint(sleep_lo, sleep_hi),
            # Recovery tracks sleep quality; stress runs inversely to it.
            "recovery_score": random.randint(sleep_lo - 10, sleep_hi - 5),
            "stress_score": random.randint(100 - sleep_hi, 100 - sleep_lo + 20),
            "vo2_max": random.randint(25, 50),
            "fitness_age": random.randint(30, 65),
        }

    def _generate_cgm(self, severity: str) -> str:
        """Continuous-glucose summary: mean glucose plus time-in-range."""
        glucose_windows = {
            "optimal": (80, 95, 92, 98),
            "mild": (85, 105, 85, 95),
            "moderate": (95, 120, 70, 85),
            "severe": (110, 140, 55, 75),
        }

        avg_lo, avg_hi, tir_lo, tir_hi = glucose_windows.get(severity, glucose_windows["moderate"])
        return f"Average glucose {random.randint(avg_lo, avg_hi)} mg/dL, time in range {random.randint(tir_lo, tir_hi)}%"

    def _generate_user_query(self, study: Dict[str, Any], age: int, gender: str, severity: str) -> str:
        """Compose the first-person request that accompanies the profile."""
        domain = study.get("domain", "longevity")

        domain_intros = {
            "longevity": f"I'm a {age}-year-old {gender} interested in longevity optimization and anti-aging protocols",
            "metabolic_health": f"I'm a {age}-year-old {gender} with metabolic dysfunction seeking evidence-based glucose control",
            "cardiovascular": f"I'm a {age}-year-old {gender} with cardiovascular risk factors wanting heart health optimization",
            "cognitive": f"I'm a {age}-year-old {gender} seeking cognitive enhancement and brain health optimization",
            "hormonal": f"I'm a {age}-year-old {gender} with hormonal imbalances needing optimization protocols",
            "inflammation": f"I'm a {age}-year-old {gender} with chronic inflammation seeking anti-inflammatory interventions",
        }

        severity_notes = {
            "optimal": "I have excellent baseline health but want to push the boundaries of optimization",
            "mild": "I have minor health concerns and want targeted interventions",
            "moderate": "I have noticeable health issues and need comprehensive protocols",
            "severe": "I have significant health challenges and require intensive interventions",
        }

        intro = domain_intros.get(domain, domain_intros["longevity"])
        return f"{intro}. {severity_notes.get(severity, '')}."
| |
|
class AIProtocolGenerator:
    """Uses OpenAI to generate health optimization protocols.

    Wraps a chat-completions client, keeps a running cost estimate, and
    turns a health profile plus study context into a long-form protocol.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.client = OpenAI(api_key=api_key)
        self.model = model
        self.total_cost = 0.0  # estimated cumulative spend in USD

    def generate_protocol(self, health_profile: Dict[str, Any], study_context: Dict[str, Any], progress_callback=None) -> Optional[str]:
        """Generate a comprehensive optimization protocol; None on API failure."""
        system_prompt = self._create_system_prompt(study_context)
        user_prompt = self._create_user_prompt(health_profile, study_context)

        def notify(message: str) -> None:
            # Progress reporting is optional; only guard against a None callback.
            if progress_callback:
                progress_callback(message)

        try:
            notify(f"π Generating protocol using {self.model}...")

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                max_tokens=4000,
                temperature=0.7,
                top_p=0.9,
            )

            self._update_cost(response.usage)
            notify(f"β Protocol generated ({response.usage.total_tokens} tokens)")
            return response.choices[0].message.content

        except Exception as e:
            notify(f"β Error generating protocol: {e}")
            return None

    def _create_system_prompt(self, study_context: Dict[str, Any]) -> str:
        """Assemble the system prompt from the study's domain and interventions."""
        focus_area = study_context.get("domain", "health")
        intervention_list = ", ".join(study_context.get("interventions", []))

        return f"""You are an advanced AI health optimization system specializing in evidence-based medicine and personalized protocols.

RESEARCH CONTEXT:
- Domain: {focus_area} optimization
- Key Interventions: {intervention_list}
- Evidence Level: Peer-reviewed clinical research

PROTOCOL REQUIREMENTS:
1. Executive Summary with current health assessment
2. Multi-Phase Protocol:
   - Phase 1: Foundation (0-3 months)
   - Phase 2: Optimization (3-6 months)
   - Phase 3: Advanced Enhancement (6-12 months)
3. Specific supplement protocols with dosages and timing
4. Lifestyle interventions (exercise, nutrition, sleep)
5. Monitoring and assessment plans
6. Expected outcomes with realistic timelines

STYLE: Professional, authoritative, using Medicine 3.0 terminology. Reference biological age, biomarkers, and cellular health.

SAFETY: Keep dosages within evidence-based safe ranges. Include monitoring recommendations.

Generate comprehensive protocols (3000+ words) with actionable precision medicine recommendations."""

    def _create_user_prompt(self, health_profile: Dict[str, Any], study_context: Dict[str, Any]) -> str:
        """Assemble the user prompt: full profile JSON plus study context."""
        return f"""
COMPREHENSIVE HEALTH OPTIMIZATION REQUEST:

Health Profile Analysis:
{json.dumps(health_profile, indent=2)}

Research Context:
- Study: {study_context.get('title', 'Health Optimization Study')}
- Domain: {study_context.get('domain', 'general health')}
- Key Findings: Based on clinical research showing significant improvements in health biomarkers

Please analyze this health profile and generate a detailed, personalized optimization protocol. Address the specific biomarker patterns, deficiencies, and health challenges identified in the data. Provide targeted interventions with precise dosing, timing, and monitoring protocols.
"""

    def _update_cost(self, usage) -> None:
        """Accumulate estimated cost from one completion's token usage (per-1K rates)."""
        per_1k_rates = {
            "gpt-3.5-turbo": {"input": 0.0015, "output": 0.002},
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        }

        # Unknown models are billed at the conservative gpt-4 rates.
        rates = per_1k_rates.get(self.model, per_1k_rates["gpt-4"])
        prompt_cost = usage.prompt_tokens * rates["input"] / 1000
        completion_cost = usage.completion_tokens * rates["output"] / 1000
        self.total_cost += prompt_cost + completion_cost
| |
|
class HealthDatasetGenerator:
    """Complete system that orchestrates the entire dataset generation process.

    Pipeline per example: synthesize a study (MedicalLiteratureSimulator),
    derive a health profile (HealthProfileGenerator), and ask the LLM for a
    matching protocol (AIProtocolGenerator). Results can be exported as a
    zip bundle for inspection and fine-tuning.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.literature_sim = MedicalLiteratureSimulator()
        self.profile_gen = HealthProfileGenerator()
        self.protocol_gen = AIProtocolGenerator(api_key, model)
        # Last generated dataset; consumed by export_dataset().
        self.generated_examples = []

    def generate_dataset(self,
                       domains: List[str] = None,
                       examples_per_domain: int = 2,
                       rate_limit_delay: float = 2.0,
                       progress_callback=None) -> Tuple[List[Dict[str, Any]], str]:
        """Generate complete health optimization dataset with progress updates.

        Args:
            domains: Research domains to cover; defaults to four core domains.
            examples_per_domain: Number of training examples per domain.
            rate_limit_delay: Seconds to sleep between API calls.
            progress_callback: Optional callable receiving status strings.

        Returns:
            Tuple of (list of training examples, human-readable summary).
        """
        if domains is None:
            domains = ["longevity", "metabolic_health", "cardiovascular", "cognitive"]

        if progress_callback:
            progress_callback(f"π Starting Health Dataset Generation")
            progress_callback(f"Domains: {domains}")
            progress_callback(f"Examples per domain: {examples_per_domain}")
            progress_callback(f"Total examples to generate: {len(domains) * examples_per_domain}")

        examples = []
        total_examples = len(domains) * examples_per_domain
        current_example = 0

        for domain in domains:
            if progress_callback:
                progress_callback(f"\nπ Processing domain: {domain}")

            for i in range(examples_per_domain):
                current_example += 1
                try:
                    if progress_callback:
                        progress_callback(f"   Creating example {i+1}/{examples_per_domain} (Overall: {current_example}/{total_examples})")

                    # Step 1: fabricate a synthetic study for this domain.
                    study = self.literature_sim.generate_study_data(domain)
                    if progress_callback:
                        progress_callback(f"   π Generated study: {study['title'][:50]}...")

                    # Step 2: derive a health profile at a random severity.
                    severity = random.choice(["mild", "moderate", "severe"])
                    health_profile = self.profile_gen.generate_profile_from_study(study, severity)
                    if progress_callback:
                        progress_callback(f"   π€ Created {severity} health profile")

                    # Step 3: ask the LLM for a matching protocol. A None
                    # return means the API call failed; skip the example.
                    protocol = self.protocol_gen.generate_protocol(health_profile, study, progress_callback)

                    if protocol:
                        training_example = {
                            "user_context": health_profile,
                            "response": protocol,
                            "citations": self._generate_citations(study),
                            "metadata": {
                                "domain": domain,
                                "severity": severity,
                                "study_pmid": study["pmid"],
                                "generated_at": datetime.now().isoformat()
                            }
                        }

                        examples.append(training_example)
                        if progress_callback:
                            progress_callback(f"   β Complete example generated")

                    # Pause between API calls, except after a domain's last example.
                    if i < examples_per_domain - 1:
                        if progress_callback:
                            progress_callback(f"   β³ Rate limit delay: {rate_limit_delay}s")
                        time.sleep(rate_limit_delay)

                except Exception as e:
                    # Keep going: one bad example must not abort the whole run.
                    if progress_callback:
                        progress_callback(f"   β Error generating example: {e}")
                    continue

        if progress_callback:
            progress_callback(f"\nπ Dataset generation complete!")
            progress_callback(f"Generated: {len(examples)} examples")
            progress_callback(f"Total cost: ${self.protocol_gen.total_cost:.4f}")

        self.generated_examples = examples
        return examples, f"Generated {len(examples)} examples. Total cost: ${self.protocol_gen.total_cost:.4f}"

    def _generate_citations(self, study: Dict[str, Any]) -> Dict[str, List[str]]:
        """Fabricate a tiered citation block anchored on the study's PMID."""
        return {
            "tier_1_peer_reviewed": [study["pmid"], f"PMC{random.randint(1000000, 9999999)}"],
            "tier_2_rct": [f"{study['domain'].upper()}.2024.{random.randint(100000, 999999)}"],
            "tier_3_cohort": [f"HEALTH.2023.{random.randint(100000, 999999)}"],
            "real_world_cases": ["Evidence-based health optimization protocols"]
        }

    def export_dataset(self, filename: str = None) -> Tuple[str, List[str]]:
        """Export dataset and return zip file path and file list.

        Writes four artifacts into ``<filename>.zip``: the raw dataset JSON,
        an OpenAI chat fine-tuning JSONL, a small sample file, and run
        metadata.

        Args:
            filename: Base name for all artifacts; defaults to a timestamped
                ``health_dataset_YYYYMMDD_HHMMSS``.

        Returns:
            Tuple of (zip file path, names of files inside the zip).
        """
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"health_dataset_{timestamp}"

        files_created = []

        # 1) Raw dataset. (Bug fix: artifact names previously hardcoded a
        # placeholder and ignored the computed ``filename``.)
        raw_data = json.dumps(self.generated_examples, indent=2, ensure_ascii=False)
        files_created.append((f"{filename}.json", raw_data))

        # 2) OpenAI chat fine-tuning file (one JSON object per line).
        fine_tune_lines = []
        for example in self.generated_examples:
            fine_tune_example = {
                "messages": [
                    {
                        "role": "system",
                        "content": "You are an advanced AI health optimization system that creates evidence-based protocols."
                    },
                    {
                        "role": "user",
                        "content": f"Create a health optimization protocol for this profile:\n\n{json.dumps(example['user_context'], indent=2)}"
                    },
                    {
                        "role": "assistant",
                        "content": example["response"]
                    }
                ]
            }
            fine_tune_lines.append(json.dumps(fine_tune_example, ensure_ascii=False))

        fine_tune_data = '\n'.join(fine_tune_lines)
        files_created.append((f"{filename}_fine_tuning.jsonl", fine_tune_data))

        # 3) Small preview sample (at most three examples).
        sample_size = min(3, len(self.generated_examples))
        sample_data = json.dumps(self.generated_examples[:sample_size], indent=2, ensure_ascii=False)
        files_created.append((f"{filename}_samples.json", sample_data))

        # 4) Run metadata: counts, cost, model, and distribution stats.
        metadata = {
            "generation_info": {
                "generated_at": datetime.now().isoformat(),
                "total_examples": len(self.generated_examples),
                "total_cost": self.protocol_gen.total_cost,
                "model_used": self.protocol_gen.model
            },
            "domains_covered": list(set(ex["metadata"]["domain"] for ex in self.generated_examples)),
            "severity_distribution": {
                severity: sum(1 for ex in self.generated_examples if ex["metadata"]["severity"] == severity)
                for severity in ["mild", "moderate", "severe"]
            }
        }

        metadata_data = json.dumps(metadata, indent=2, ensure_ascii=False)
        files_created.append((f"{filename}_metadata.json", metadata_data))

        # Bundle everything into a single compressed zip on disk.
        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file_name, file_content in files_created:
                zip_file.writestr(file_name, file_content)

        zip_filename = f"{filename}.zip"
        with open(zip_filename, 'wb') as f:
            f.write(zip_buffer.getvalue())

        file_list = [f[0] for f in files_created]
        return zip_filename, file_list
| |
|
| | |
| | |
| | |
| |
|
| | class HealthDatasetGradioInterface: |
| | """Gradio web interface for the health dataset generator""" |
| | |
| | def __init__(self): |
| | self.generator = None |
| | self.available_domains = list(MedicalLiteratureSimulator().research_domains.keys()) |
| | |
| | def estimate_cost(self, domains, examples_per_domain, model): |
| | """Estimate generation cost""" |
| | if not domains: |
| | return "Please select at least one domain" |
| | |
| | total_examples = len(domains) * examples_per_domain |
| | |
| | cost_per_example = { |
| | "gpt-3.5-turbo": 0.05, |
| | "gpt-4": 0.25, |
| | "gpt-4-turbo": 0.15 |
| | } |
| | |
| | estimated_cost = total_examples * cost_per_example.get(model, 0.25) |
| | |
| | return f"π° Estimated cost: ${estimated_cost:.2f} for {total_examples} examples" |
| | |
| | def validate_inputs(self, api_key, domains, examples_per_domain): |
| | """Validate user inputs""" |
| | if not api_key or not api_key.strip(): |
| | return False, "β Please provide your OpenAI API key" |
| | |
| | if not domains: |
| | return False, "β Please select at least one domain" |
| | |
| | if examples_per_domain < 1 or examples_per_domain > 10: |
| | return False, "β Examples per domain must be between 1 and 10" |
| | |
| | return True, "β
Inputs are valid" |
| | |
    def generate_dataset_interface(self, api_key, domains, examples_per_domain, model, rate_limit):
        """Main dataset generation function for Gradio interface.

        Generator: each ``yield`` emits a 5-tuple bound to the UI outputs
        (progress text, summary, markdown preview, downloadable zip path,
        file list). Yields intermediate states so the UI updates while the
        run is in flight.
        """

        # Reject bad settings before touching the API.
        is_valid, message = self.validate_inputs(api_key, domains, examples_per_domain)
        if not is_valid:
            yield message, "", "", None, None
            return

        # (Re)create the generator with the supplied key/model each run.
        try:
            self.generator = HealthDatasetGenerator(api_key.strip(), model)
        except Exception as e:
            yield f"β Error initializing generator: {e}", "", "", None, None
            return

        # Rolling progress log; the closure appends and returns the last
        # 20 lines so long runs don't flood the textbox.
        progress_messages = []

        def progress_callback(message):
            progress_messages.append(message)
            progress_text = "\n".join(progress_messages[-20:])
            return progress_text

        try:
            yield "π Starting dataset generation...", "", "", None, None

            dataset, summary = self.generator.generate_dataset(
                domains=domains,
                examples_per_domain=examples_per_domain,
                rate_limit_delay=rate_limit,
                progress_callback=progress_callback
            )

            if not dataset:
                yield "β No examples generated", "", "", None, None
                return

            # Persist artifacts to a zip so the UI can offer a download.
            progress_callback("πΎ Exporting dataset...")
            zip_filename, file_list = self.generator.export_dataset()

            preview = self.create_dataset_preview(dataset)

            final_progress = progress_callback(f"π Generation complete! Files: {', '.join(file_list)}")

            yield final_progress, summary, preview, zip_filename, file_list

        except Exception as e:
            yield f"β Error during generation: {e}", "", "", None, None
| | |
| | def create_dataset_preview(self, dataset): |
| | """Create a preview of the generated dataset""" |
| | if not dataset: |
| | return "No data to preview" |
| | |
| | preview = "π **Dataset Preview**\n\n" |
| | |
| | |
| | preview += f"**Total Examples:** {len(dataset)}\n" |
| | |
| | |
| | domains = [ex['metadata']['domain'] for ex in dataset] |
| | domain_counts = {d: domains.count(d) for d in set(domains)} |
| | preview += f"**Domain Distribution:** {domain_counts}\n" |
| | |
| | |
| | severities = [ex['metadata']['severity'] for ex in dataset] |
| | severity_counts = {s: severities.count(s) for s in set(severities)} |
| | preview += f"**Severity Distribution:** {severity_counts}\n\n" |
| | |
| | |
| | if dataset: |
| | example = dataset[0] |
| | preview += "**Sample Example:**\n" |
| | preview += f"- **Domain:** {example['metadata']['domain']}\n" |
| | preview += f"- **Severity:** {example['metadata']['severity']}\n" |
| | preview += f"- **User Query:** {example['user_context']['user_query'][:150]}...\n" |
| | preview += f"- **Response Length:** {len(example['response'])} characters\n" |
| | preview += f"- **PMID:** {example['metadata']['study_pmid']}\n" |
| | |
| | return preview |
| | |
| | def analyze_dataset_file(self, zip_file): |
| | """Analyze uploaded dataset file""" |
| | if zip_file is None: |
| | return "No file uploaded" |
| | |
| | try: |
| | |
| | with zipfile.ZipFile(zip_file.name, 'r') as zip_ref: |
| | |
| | json_files = [f for f in zip_ref.namelist() if f.endswith('.json') and not f.endswith('_samples.json') and not f.endswith('_metadata.json')] |
| | |
| | if json_files: |
| | dataset_file = json_files[0] |
| | with zip_ref.open(dataset_file) as f: |
| | dataset = json.load(f) |
| | |
| | analysis = "π **Dataset Analysis**\n\n" |
| | analysis += f"**Total Examples:** {len(dataset)}\n" |
| | analysis += f"**Average Response Length:** {sum(len(ex['response']) for ex in dataset) / len(dataset):.0f} characters\n" |
| | |
| | |
| | long_responses = sum(1 for ex in dataset if len(ex['response']) > 2000) |
| | has_phases = sum(1 for ex in dataset if "Phase" in ex['response']) |
| | has_dosages = sum(1 for ex in dataset if re.search(r'\d+\s*mg', ex['response'])) |
| | |
| | analysis += f"**Quality Metrics:**\n" |
| | analysis += f"- Responses >2000 chars: {long_responses}/{len(dataset)} ({long_responses/len(dataset)*100:.1f}%)\n" |
| | analysis += f"- Responses with phases: {has_phases}/{len(dataset)} ({has_phases/len(dataset)*100:.1f}%)\n" |
| | analysis += f"- Responses with dosages: {has_dosages}/{len(dataset)} ({has_dosages/len(dataset)*100:.1f}%)\n" |
| | |
| | return analysis |
| | else: |
| | return "No dataset JSON file found in zip" |
| | |
| | except Exception as e: |
| | return f"Error analyzing file: {e}" |
| | |
    def create_interface(self) -> gr.Blocks:
        """Build and return the Gradio Blocks application.

        Assembles a three-tab UI:
          1. "Generate Dataset" — configuration inputs (API key, domains,
             example count, model, rate limit) plus progress/summary/preview
             outputs and a zip download.
          2. "Analyze Dataset" — upload a previously generated zip and view
             a quality analysis.
          3. "Information" — static usage, cost, and disclaimer notes.

        Event wiring: changes to domains/examples/model refresh the cost
        estimate; the generate and analyze buttons call the corresponding
        instance methods.

        Returns:
            gr.Blocks: the fully wired interface, ready for `.launch()`.
        """

        with gr.Blocks(title="Medical Literature Health Dataset Generator", theme=gr.themes.Soft()) as interface:

            # NOTE: emoji characters in the strings below appear
            # mojibake-encoded (e.g. "π₯") — preserved as-is; fixing them
            # would change user-visible runtime text.
            gr.Markdown("""
            # π₯ Medical Literature Health Dataset Generator

            This tool generates synthetic health optimization datasets based on medical literature patterns.
            Perfect for training AI models on evidence-based health protocols.

            β οΈ **Important:** Generated content is for research/educational purposes only. Not medical advice.
            """)

            # --- Tab 1: dataset generation -------------------------------
            with gr.Tab("π Generate Dataset"):

                with gr.Row():
                    # Left column: all generation parameters.
                    with gr.Column(scale=1):
                        gr.Markdown("### βοΈ Configuration")

                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            info="Your OpenAI API key for generating protocols"
                        )

                        # Choices come from self.available_domains, set up
                        # elsewhere on this class (not visible in this chunk).
                        domains = gr.CheckboxGroup(
                            label="Research Domains",
                            choices=self.available_domains,
                            value=["longevity", "metabolic_health"],
                            info="Select medical research domains to include"
                        )

                        examples_per_domain = gr.Slider(
                            label="Examples per Domain",
                            minimum=1,
                            maximum=10,
                            value=2,
                            step=1,
                            info="Number of examples to generate for each domain"
                        )

                        model = gr.Dropdown(
                            label="OpenAI Model",
                            choices=["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"],
                            value="gpt-4",
                            info="Model for generating protocols (GPT-4 recommended for quality)"
                        )

                        rate_limit = gr.Slider(
                            label="Rate Limit Delay (seconds)",
                            minimum=0.5,
                            maximum=5.0,
                            value=2.0,
                            step=0.5,
                            info="Delay between API calls to avoid rate limits"
                        )

                        # Read-only; refreshed by the change handlers wired
                        # up at the bottom of this method.
                        cost_estimate = gr.Textbox(
                            label="Cost Estimate",
                            value="Select domains and examples to see estimate",
                            interactive=False
                        )

                        generate_btn = gr.Button(
                            "π Generate Dataset",
                            variant="primary",
                            size="lg"
                        )

                    # Right column: live progress and results.
                    with gr.Column(scale=2):
                        gr.Markdown("### π Progress & Results")

                        progress_output = gr.Textbox(
                            label="Generation Progress",
                            lines=15,
                            max_lines=20,
                            value="Ready to generate dataset...",
                            interactive=False
                        )

                        summary_output = gr.Textbox(
                            label="Generation Summary",
                            lines=3,
                            interactive=False
                        )

                        preview_output = gr.Markdown(
                            label="Dataset Preview",
                            value="Dataset preview will appear here..."
                        )

                with gr.Row():
                    # Zip produced by generate_dataset_interface.
                    download_file = gr.File(
                        label="π₯ Download Generated Dataset",
                        interactive=False
                    )

                    file_list = gr.Textbox(
                        label="Generated Files",
                        placeholder="Files included in download will be listed here",
                        interactive=False
                    )

            # --- Tab 2: analyze a previously generated zip ----------------
            with gr.Tab("π Analyze Dataset"):
                gr.Markdown("### π Dataset Analysis")
                gr.Markdown("Upload a generated dataset zip file to analyze its quality and structure.")

                with gr.Row():
                    with gr.Column():
                        upload_file = gr.File(
                            label="Upload Dataset Zip File",
                            file_types=[".zip"]
                        )

                        analyze_btn = gr.Button(
                            "π Analyze Dataset",
                            variant="secondary"
                        )

                    with gr.Column():
                        analysis_output = gr.Markdown(
                            label="Analysis Results",
                            value="Upload a dataset file to see analysis..."
                        )

            # --- Tab 3: static information / disclaimers ------------------
            with gr.Tab("βΉοΈ Information"):
                gr.Markdown("""
                ### π How It Works

                1. **Literature Simulation**: Creates realistic medical studies with proper abstracts, interventions, and outcomes
                2. **Health Profile Generation**: Generates comprehensive health profiles based on study domains and severity levels
                3. **AI Protocol Generation**: Uses OpenAI to create detailed health optimization protocols
                4. **Dataset Export**: Outputs data in multiple formats including OpenAI fine-tuning format

                ### π― Output Files

                - **`dataset.json`**: Complete raw dataset
                - **`dataset_fine_tuning.jsonl`**: OpenAI fine-tuning format
                - **`dataset_samples.json`**: Sample examples for review
                - **`dataset_metadata.json`**: Generation statistics and info

                ### π° Cost Information

                - **GPT-3.5-turbo**: ~$0.05 per example
                - **GPT-4**: ~$0.25 per example
                - **GPT-4-turbo**: ~$0.15 per example

                ### β οΈ Important Notes

                - Generated content is for **research/educational purposes only**
                - **Not medical advice** - always consult healthcare professionals
                - Include appropriate medical disclaimers when using generated content
                - Review sample outputs before using in production

                ### π§ Recommended Settings

                - **Start small**: Generate 2-4 examples first to test quality
                - **Use GPT-4**: Better quality than GPT-3.5-turbo
                - **Rate limiting**: Use 2+ second delays to avoid API limits
                - **Multiple domains**: Include diverse domains for comprehensive dataset
                """)

            # --- Event wiring --------------------------------------------

            # Thin wrapper so the three change handlers share one callable;
            # delegates straight to self.estimate_cost.
            def update_cost_estimate(domains, examples_per_domain, model):
                return self.estimate_cost(domains, examples_per_domain, model)

            # Any change to domains / example count / model refreshes the
            # read-only cost estimate box.
            for input_component in [domains, examples_per_domain, model]:
                input_component.change(
                    fn=update_cost_estimate,
                    inputs=[domains, examples_per_domain, model],
                    outputs=[cost_estimate]
                )

            # Main generation action: streams progress, summary, preview,
            # download file, and file listing back to the UI.
            generate_btn.click(
                fn=self.generate_dataset_interface,
                inputs=[api_key, domains, examples_per_domain, model, rate_limit],
                outputs=[progress_output, summary_output, preview_output, download_file, file_list]
            )

            # Analysis action for the uploaded zip.
            analyze_btn.click(
                fn=self.analyze_dataset_file,
                inputs=[upload_file],
                outputs=[analysis_output]
            )

        return interface
| |
|
| | |
| | |
| | |
| |
|
def main():
    """Build the dataset-generator UI and serve it with Gradio.

    Constructs the Blocks app via HealthDatasetGradioInterface, then
    launches it bound to all network interfaces on port 7860 with a
    public share link enabled.
    """
    print("π Launching Medical Literature Health Dataset Generator")
    print("This will start a web interface accessible through your browser")

    # Assemble the Blocks app, then hand it to Gradio's built-in server.
    app = HealthDatasetGradioInterface().create_interface()

    launch_options = {
        "share": True,             # publish a temporary public share URL
        "server_name": "0.0.0.0",  # bind on all interfaces, not just localhost
        "server_port": 7860,
        "show_error": True,        # surface tracebacks in the browser UI
        "quiet": False,
    }
    app.launch(**launch_options)
| |
|
# Script entry point: launch the web UI only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
| | |
| | |
| | |