
    yibI                         d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
ZddlmZ ddlZ G d d      Zd Zed	k(  r e        yy)
z]
Medical Transcript Entity Extraction System
Using Local LLM (Llama 3.1 8B Instruct / Gemma)
    N)Path)DictListTuple)datetimec                       e Zd ZdZddZdedefdZdedefdZdefdZd	edefd
Z	dedefdZ
dedefdZdedefdZdedee   fdZdee   defdZdee   defdZy)MedicalTranscriptAnalyzerz=Analyzes medical transcripts and extracts structured entitiesc                     || _         g dg dg dg dg dg dd| _        g | _        | j                  j                         D ],  \  }}|D ]"  }| j                  j	                  | d|        $ . y	)
z
        Initialize the analyzer
        
        Args:
            model_name: Name of the LLM model (llama3.1:8b or gemma)
        )zBlurred visionzFluctuating visionz!Dark or floating spots (floaters)zPoor night visionzFaded colors)MicroaneurysmszHard exudatesz!Soft exudates (cotton wool spots)zRetinal hemorrhageszMacular edemaNeovascularizationzScar tissue)zDilated eye examz"Optical coherence tomography (OCT)zFundus photographyzFluorescein angiography)zBlood sugar levels (HbA1c)zBlood pressure (BP)zCholesterol levels)zIntravitreal injectionszLaser treatmentz#Surgical interventions (vitrectomy))zDuration of diabeteszAge over 65zSmoking status)SymptomszOphthalmic FindingszDiagnostic ToolszSystemic Risk FactorszTreatment OptionszDemographics/History: N)
model_nameentitiesall_entitiesitemsappend)selfr   categoryr   items        C/home/sandhiya/DR-Transcripts-Claude/medical_transcript_analyzer.py__init__z"MedicalTranscriptAnalyzer.__init__   s     %$!&
"
%C&
R #}}224 	@OHe @!!((H:Rv)>?@	@    pdf_pathreturnc                    d}	 t        |d      5 }t        j                  |      }|j                  D ]  }||j	                         dz   z  } 	 ddd       |S # 1 sw Y   |S xY w# t
        $ r}t        d| d|        Y d}~|S d}~ww xY w)zExtract text from PDF file rb
NzError reading r   )openPyPDF2	PdfReaderpagesextract_text	Exceptionprint)r   r   textfile
pdf_readerpagees          r   extract_text_from_pdfz/MedicalTranscriptAnalyzer.extract_text_from_pdfP   s    	4h% 7#--d3
&,, 7DD--/$66D77 7   	4N8*Bqc233	4s4   A% =AA% A"A% "A% %	B
.BB
filenamec                     t        |      j                  }t        j                  d|      }|r|j	                  d      }|S |S )zt
        Extract conversation ID from filename
        Handle multi-part conversations (e.g., part1, part2)
        zdr_([A-Z]{2}\d{4})   )r   stemresearchgroup)r   r-   	base_namematch
patient_ids        r   identify_conversation_idz2MedicalTranscriptAnalyzer.identify_conversation_id\   sB     N''	 		/;QJr   c                 
    d}|S )z3Create few-shot examples for better LLM performanceaU  
EXAMPLE 1:
Doctor: "How is your vision lately?"
Patient: "I've been seeing floaters, like dark spots moving around. It's been getting worse over the past month."
Doctor: "Any blurred vision?"
Patient: "Yes, especially when I try to read. Everything looks fuzzy."

EXTRACTION:
{
  "Symptoms: Blurred vision": {"present": true, "sentiment": "negative", "details": "Vision is fuzzy, especially when reading"},
  "Symptoms: Dark or floating spots (floaters)": {"present": true, "sentiment": "negative", "details": "Dark spots moving around, worsening over past month"},
  "Patient Concerns": "Vision problems affecting daily activities like reading",
  "Severity": "Moderate to severe - worsening over time"
}

EXAMPLE 2:
Doctor: "What brings you in today?"
Patient: "I've had diabetes for 15 years now. My blood sugar has been hard to control."
Doctor: "What's your latest HbA1c?"
Patient: "It was 8.5 last month. I'm really worried about my eyes."

EXTRACTION:
{
  "Demographics/History: Duration of diabetes": {"present": true, "sentiment": "neutral", "details": "15 years"},
  "Systemic Risk Factors: Blood sugar levels (HbA1c)": {"present": true, "sentiment": "negative", "details": "HbA1c 8.5 - poorly controlled"},
  "Patient Concerns": "Worried about eye complications from diabetes",
  "Patient Goals": "Better control of blood sugar to prevent eye damage"
}
 )r   few_shot_exampless     r   create_few_shot_promptz0MedicalTranscriptAnalyzer.create_few_shot_promptl   s    : ! r   transcript_textc                     dj                  | j                  D cg c]  }d| 	 c}      }d| j                          d|dd  d| d}|S c c}w )	z>Build the prompt for entity extraction with sentiment analysisr   z- zYou are a medical data extraction expert. Your task is to analyze a doctor-patient conversation transcript and extract specific medical entities.

z5

Now analyze the following transcript:

TRANSCRIPT:
Ni  a    

TASK:
1. For each entity below, determine if it is mentioned (present/absent)
2. If present, extract:
   - Sentiment (positive/neutral/negative)
   - Specific details/context from the conversation
3. Also identify:
   - Main questions asked by the doctor
   - Patient's concerns and worries
   - Patient's goals and what they hope to achieve
   - Occurrence patterns (how often, when)
   - Severity level mentioned

ENTITIES TO EXTRACT:
a  

RESPONSE FORMAT (JSON only):
{
  "entity_name": {
    "present": true/false,
    "sentiment": "positive/neutral/negative",
    "details": "specific quote or summary from conversation",
    "occurrence": "frequency/timing if mentioned",
    "severity": "mild/moderate/severe if mentioned"
  },
  "doctor_questions": ["question 1", "question 2"],
  "patient_concerns": "summary of patient worries",
  "patient_goals": "what patient hopes to achieve",
  "overall_severity": "assessment"
}

Respond ONLY with valid JSON, no additional text.)joinr   r;   )r   r<   entityentity_listprompts        r   build_extraction_promptz1MedicalTranscriptAnalyzer.build_extraction_prompt   sx     iiT=N=N O62fX OP   
 $    2-'5R W !Ps   ArA   c           	      0   	 ddl }|j                  d| j                  |ddddd	      }|j                  d
k(  r"|j	                         }|j                  dd      S t        d|j                          y# t        $ r}t        d|        Y d}~yd}~ww xY w)zm
        Call local LLM using Ollama API
        Make sure Ollama is running: ollama run llama3.1:8b
        r   Nz#http://localhost:11434/api/generateFg?g?)modelrA   streamtemperaturetop_px   )jsontimeout   responser   zLLM API Error: zError calling LLM: )requestspostr   status_coderI   getr&   r%   )r   rA   rM   rL   resultr+   s         r   call_llmz"MedicalTranscriptAnalyzer.call_llm   s    
	}}5!__$##&   % 
H ##s*!zz*b11(<(<'=>? 	's+,	s   AA4 A4 4	B=BBrL   c                     	 t        j                  d|t         j                        }|r%|j                         }t	        j
                  |      S i S # t        j                  $ r}t        d|        i cY d}~S d}~ww xY w)zParse JSON response from LLMz\{.*\}zJSON parsing error: N)r1   r2   DOTALLr3   rI   loadsJSONDecodeErrorr&   )r   rL   
json_matchjson_strr+   s        r   parse_llm_responsez,MedicalTranscriptAnalyzer.parse_llm_response   sm    
	9h		BJ%++-zz(++	## 	(,-I	s$   AA A A=#A82A=8A=c                    t        dt        |      j                          | j                  |      }|j	                         st        d|        i S | j                  t        |      j                        }| j                  |      }t        d| j                   d       | j                  |      }| j                  |      }||d<   t        |      j                  |d<   t        |      |d<   |S )zAnalyze a single transcriptz
Analyzing: z Warning: No text extracted from zCalling z...conversation_idr-   raw_text_length)r&   r   namer,   stripr7   rB   r   rR   rY   len)r   r   r<   r[   rA   llm_responseextracted_datas          r   analyze_transcriptz,MedicalTranscriptAnalyzer.analyze_transcript   s     	d8n11234 44X>$$&4XJ?@I 77X8K8KL --o> 	)-.}}V, 00> -<()%)(^%8%8z",/,@()r   pdf_directoryc                     t        t        |      j                  d            }t        dt	        |       d       g }t        |      D ]0  }| j                  t        |            }|s |j                  |       2 |S )z(Analyze all PDF transcripts in directoryz*.pdfzFound z
 PDF files)	listr   globr&   r_   sortedrb   strr   )r   rc   	pdf_filesresultspdf_filerQ   s         r   analyze_all_transcriptsz1MedicalTranscriptAnalyzer.analyze_all_transcripts  sw     m,11':;	s9~&j12y) 	'H,,S];Fv&	'
 r   rj   output_pathc                    t        j                  |d      }t        t        |D cg c]  }|j	                  dd       c}            t        |      t        j                         j                  d      | j                  d}t        j                  |g      j                  |dd	       g }|D ]  }|j	                  dd      |j	                  d
d      d}| j                  D ]i  }	|j	                  |	i       }
t        |
t              r5|
j	                  dd      rdnd||	 d<   |
j	                  dd      ||	 d<   Zd||	 d<   d||	 d<   k |j                  |        t        j                  |      }|j                  |dd	       g }|D ]  }| j                  D ]  }	|j	                  |	i       }
t        |
t              s&|
j	                  dd      s9|j                  |j	                  dd      |j	                  d
d      |	|
j	                  dd      |
j	                  dd      |
j	                  dd      |
j	                  dd      d         t        j                  |      }|j                  |dd	       g }|D ]e  }|j	                  dg       }|s|j                  |j	                  dd      t        |t              rdj!                  |      n
t#        |      d       g |r)t        j                  |      }|j                  |dd	       g }|D ]i  }|j                  |j	                  dd      |j	                  d
d      |j	                  dd      |j	                  dd      |j	                  dd      d        k t        j                  |      }|j                  |d!d	       |j%                          t'        d"|        y#c c}w )$z6Create comprehensive Excel output with multiple sheetsopenpyxl)enginer[   r   %Y-%m-%d %H:%M)zTotal ConversationszTotal DocumentszAnalysis Datez
Model UsedSummaryF)
sheet_nameindexr-   )Conversation_IDFilenamepresentYesNo_Present	sentimentzN/A
_SentimentEntity_Matrixdetails
occurrenceseverity)ru   rv   Entity	SentimentDetails
OccurrenceSeverityDetailed_Extractionsdoctor_questionsr   )ru   	QuestionsDoctor_Questionspatient_concernspatient_goalsoverall_severity)ru   rv   ConcernsGoalsOverall_SeverityPatient_Perspectivesu    
✓ Tabulated output saved to: N)pdExcelWriterr_   setrP   r   nowstrftimer   	DataFrameto_excelr   
isinstancedictr   re   r>   rh   closer&   )r   rj   rm   writerrsummary_dataentity_matrixrQ   rowr?   entity_datadf_entitiesdetailed_datadf_detailedquestions_data	questionsdf_questionspatient_data
df_patients                      r   create_tabulated_outputz1MedicalTranscriptAnalyzer.create_tabulated_output  s    J? $'sRY+ZQAEE2CR,H+Z'[#\"7|%\\^445EF//	
 	l^$--fRW-X  	&F#)::.?#D"JJz26C ++ 7$jj4k408C	SX8Yu_cC6((+,1<e1TC6(*-./3C6((+,16C6(*-.7   %!	&$ ll=1VuM  	F++ $jj4k40[__YPU5V!((+1::6G+L$*JJz2$>"(%0__["%E#.??9b#A&1oolB&G$/OOJ$C* 	 ll=1V0FeT  	F

#5r:I%%'-zz2CR'H9CIt9T9!5Z]^gZh' 	 <<7L!!&5Gu!U  	F#)::.?#D"JJz26"JJ'92>OR8$*JJ/A2$F! 	 \\,/
F/EUS1+?@a ,[s   Oc                 h   t        |dd      5 }|j                  d       |j                  d       |j                  d       t        t        |D cg c]  }|j	                  dd       c}            }|j                  d	       |j                  d
| d       |j                  dt        |       d       |j                  d| j
                   d       |j                  dt        j                         j                  d       d       |j                  d       g }|D ]6  }|j	                  dg       }t        |t              s&|j                  |       8 t        t        |            }	t        |	dd d      D ]  \  }
}|j                  d|
 d| d        |j                  d       |j                  d       i }| j                  D ]!  t        fd|D              }|dkD  s||<   # t        |j!                         d d      }|dd D ]>  \  }|t        |      z  dz  }|j                  d  d!| d"t        |       d#|d$d%	       @ |j                  d       |j                  d&       dddd'}|D ]Z  }| j                  D ]I  |j	                  i       }t        |t"              s&|j	                  d(d)      }||v s=||xx   dz  cc<   K \ t        |j%                               }|j!                         D ]>  \  }}|dkD  r||z  dz  nd}|j                  d |j'                          d!| d#|d$d%       @ |j                  d       |j                  d       |j                  d*       |j                  d       ddd       t)        d+|        yc c}w # 1 sw Y   xY w),z$Generate a comprehensive text reportwutf-8encodingzQ================================================================================
z#MEDICAL TRANSCRIPT ANALYSIS REPORT
zR================================================================================

r[   r   z1. SUMMARY STATISTICS
z!   - Total Unique Conversations: r   z   - Total Documents Analyzed: z   - Model Used: z   - Analysis Date: rq   z

z2. COMMON DOCTOR QUESTIONS
r   N
   r/   z   z. z3. ENTITY PREVALENCE
c              3   h   K   | ])  }|j                  i       j                  d d      s&d + yw)rw   Fr/   N)rP   ).0r   r?   s     r   	<genexpr>zEMedicalTranscriptAnalyzer.generate_analysis_report.<locals>.<genexpr>  s,     Z!!%%2C2G2G	SX2YAZs   '22r   c                     | d   S )Nr/   r9   )xs    r   <lambda>zDMedicalTranscriptAnalyzer.generate_analysis_report.<locals>.<lambda>  s
    !A$ r   T)keyreverse   d   z   - r   /z (z.1fz%)
z4. SENTIMENT OVERVIEW
)positiveneutralnegativer{   r   zEND OF REPORT
u   ✓ Analysis report saved to: )r    writer_   r   rP   r   r   r   r   r   re   extend	enumerater   sumrg   r   r   values
capitalizer&   )r   rj   rm   fr   unique_conversationsall_questionsrQ   r   unique_questionsiqentity_countscountsorted_entities
percentagesentiment_countsr   r{   totalr?   s                       @r   generate_analysis_reportz2MedicalTranscriptAnalyzer.generate_analysis_reports  s    +sW5 ;	%GGO$GG:;GG%& $'sRY+ZQAEE2CR,H+Z'[#\ GG-/GG78L7MRPQGG5c'l^2FGGG''8;<GG*8<<>+B+BCS+T*UUYZ[ GG24M! 4"JJ'92>	i.!((34
  $C$67!"23B"7; *1#aS1#R()*GGDM GG,.M++ 2ZwZZ19,1M&)2
 %]%8%8%:X\]O!0"!5 X#c'l2c9
%xr%#g,r*SAQQUVWX GGDM GG-/,-!K! ="// =F"(**VR"8K!+t4$/OOK$K	$(88,Y71<7== (//12E$4$:$:$< Y 	56;aieemc1Q
%	 4 4 67r%:cBRRVWXY GGDMGGO$GG%&GGO$w;	%z 	.{m<=o ,[;	% ;	%s:   AN(N#+CN(
B"N(-CN(6N(B?N(#N((N1N)llama3.1:8b)__name__
__module____qualname____doc__r   rh   r,   r7   r;   rB   rR   r   rY   rb   r   rl   r   r   r9   r   r   r	   r	      s    G6@p
c 
c 
    !  !D.s .s .`s s >3 4 3 4 @S T$Z WAtDz WA WAr@>T
 @> @>r   r	   c                  b   t        d       t        d       t        d       t                t        j                  j                  d      } d}t        j                  |d       t        d       t        d       t        d	       t                t        d
      j                         xs d}|dk(  rdnd}t        d|        t        d       t        d|        t                t        d       t        |      }t        d       |j                  |       }|st        d       yt        j                         j                  d      }| d| d}t        |dd      5 }t        j                  ||dd       ddd       t        d|        | d | d!}	|j                  ||	       | d"| d#}
|j!                  ||
       t        d$       t        d%       t        d       t        d&       t        d'|        t        d(|	        t        d)|
        t                y# 1 sw Y   xY w)*zMain execution functionzP================================================================================z+MEDICAL TRANSCRIPT ENTITY EXTRACTION SYSTEMz#~/DR-Transcripts-Claude/pdf_uploadsz,/home/sandhiya/DR-Transcripts-Claude/outputsT)exist_okzAvailable models:z&1. llama3.1:8b (Llama 3.1 8B Instruct)z2. gemma:7b (Google Gemma 7B)z"Select model (1 or 2, default=1): 1r   zgemma:7bz
Using model: z;
Note: Make sure Ollama is running with the selected model!zRun: ollama run zPress Enter to continue...)r   z
Starting analysis...z@No results generated. Check if PDFs exist and Ollama is running.Nz%Y%m%d_%H%M%Sz/extracted_data_z.jsonr   r   r      F)indentensure_asciiu   ✓ Raw JSON saved to: z/analysis_tabulation_z.xlsxz/analysis_report_z.txtzQ
================================================================================zANALYSIS COMPLETE!z
Output files:z1. z2. z3. )r&   ospath
expandusermakedirsinputr^   r	   rl   r   r   r   r    rI   dumpr   r   )PDF_DIRECTORY
OUTPUT_DIRmodel_choicer   analyzerrj   	timestampjson_outputr   excel_outputreport_outputs              r   mainr     s    
(O	
78	(O	G GG&&'LMM?J KK
T* 

	
23	
)*	G=>DDFM#L".#"5:J	OJ<
()	
HI	ZL
)*	G	
&' )J?H 

"#..}=GPQ ''8I  L 05AK	k3	1 <Q		'1QU;<	#K=
12 !\!6ykGL$$Wl; "l"3I;dCM%%g}=	/	
	(O	O	C}
	C~
	C
 	G'< <s    H%%H.__main__)r   rI   r   r1   pathlibr   typingr   r   r   pandasr   r   r!   r	   r   r   r9   r   r   <module>r      sQ   
  	 	  $ $   ^> ^>BCL zF r   