o
    RTh                     @   s8  d dl Z d dlZd dlmZmZmZmZ d dlmZ d dlm	Z
 G dd deZdd Zd	d
 ZddgfddZd<ddZddgfddZdd Zdd Zdd Zdd Zdd Zdd Zdd  Zddgfd!d"Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Z d3d4 Z!d=d6d7Z"d8d9 Z#d:d; Z$dS )>    N)AnyCallable
NamedTupleOptional)assert_has_pandas)pandasc                   @   s   e Zd ZU eed< dZee ed< dZee ed< dZee	e
ge
f  ed< dZee ed< dZee	e
ge
f  ed< dZee ed< dS )	RemediationnameNimmediate_msgnecessary_msgnecessary_fnoptional_msgoptional_fn	error_msg)__name__
__module____qualname__str__annotations__r
   r   r   r   r   r   r   r   r    r   r   T/home/air/segue/gemini/backup/venv/lib/python3.10/site-packages/openai/validators.pyr   	   s   
 r   c                 C   s8   d}t | |kr
dnd}dt |  d| }td|dS )z
    This validator will only print out the number of examples and recommend to the user to increase the number of examples if less than 100.
    d    z. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examplesz
- Your file contains z prompt-completion pairsnum_examplesr	   r
   )lenr   )dfMIN_EXAMPLESoptional_suggestionr
   r   r   r   num_examples_validator   s   r   c                    s   dd  d}d}d}d}| j vr7dd | j D v r1 fdd}|}d d	}d
 d}nd d}td||||dS )z[
    This validator will ensure that the necessary column is present in the dataframe.
    c                    s2    fdd| j D }| j|d   idd | S )Nc                    s    g | ]}t |  kr|qS r   r   lower.0ccolumnr   r   
<listcomp>)   s     zInecessary_column_validator.<locals>.lower_case_column.<locals>.<listcomp>r   T)columnsinplace)r(   renamer!   )r   r&   colsr   r%   r   lower_case_column(   s   z5necessary_column_validator.<locals>.lower_case_columnNc                 S   s   g | ]}t | qS r   r    r"   r   r   r   r'   3       z.necessary_column_validator.<locals>.<listcomp>c                    
    | S Nr   )r   r,   necessary_columnr   r   lower_case_column_creator5      
z=necessary_column_validator.<locals>.lower_case_column_creatorz
- The `z ` column/key should be lowercasezLower case column name to ``z^` column/key is missing. Please make sure you name your columns/keys appropriately, then retryr1   )r	   r
   r   r   r   )r(   r   )r   r1   r
   r   r   r   r2   r   r0   r   necessary_column_validator#   s(   

r5   prompt
completionc                    s   g }d}d}d}t | jdkrLfdd| jD }d}|D ]  fdd|D }t |dkr9|d  d	  d
7 }qd| | }d| }fdd}td|||dS )zK
    This validator will remove additional columns from the dataframe.
    N   c                    s   g | ]}| vr|qS r   r   r"   fieldsr   r   r'   R   r-   z/additional_column_validator.<locals>.<listcomp>r   c                    s   g | ]} |v r|qS r   r   r"   )acr   r   r'   U   r-   r   z9
  WARNING: Some of the additional columns/keys contain `z<` in their name. These will be ignored, and the column/key `z`` will be used instead. This could also result from a duplicate column/key in the provided file.zh
- The input file should contain exactly two columns/keys per row. Additional columns/keys present are: z Remove additional columns/keys: c                    s   |   S r/   r   xr9   r   r   r   [   s   z1additional_column_validator.<locals>.necessary_fnadditional_columnr	   r
   r   r   )r   r(   r   )r   r:   additional_columnsr   r
   r   warn_messagedupsr   )r;   r:   r   additional_column_validatorI   s*   
rC   c                    s   d}d}d}|    dd  s|     rG|   dk|    B }|  j|  }d  d| } fdd}d	t| d
  d}td  |||dS )zA
    This validator will ensure that no completion is empty.
    Nc                 S   s   | dkS )Nr   r   r<   r   r   r   <lambda>n   s    z+non_empty_field_validator.<locals>.<lambda>r   z
- `z?` column/key should not contain empty strings. These are rows: c                    s   | |   dk j  gdS )Nr   subset)dropnar<   fieldr   r   r   s   s   z/non_empty_field_validator.<locals>.necessary_fnRemove z rows with empty sempty_r?   )applyanyisnullreset_indexindextolistr   r   )r   rI   r   r   r
   
empty_rowsempty_indexesr   rH   r   non_empty_field_validatorf   s   &rU   c                    s   | j  d}|  j|  }d}d}d}t|dkr9dt| dd  d| }dt| d	} fd
d}td|||dS )zY
    This validator will suggest to the user to remove duplicate rows if they exist.
    rE   Nr   
- There are z duplicated -z sets. These are rows: rJ   z duplicate rowsc                    s   | j  dS )NrE   )drop_duplicatesr<   r9   r   r   r         z.duplicated_rows_validator.<locals>.optional_fnduplicated_rowsr	   r
   r   r   )
duplicatedrP   rQ   rR   r   joinr   )r   r:   rZ   duplicated_indexesr
   r   r   r   r9   r   duplicated_rows_validator   s    r_   c                    s|   d}d}d}t | }|dkr6dd   | tdkr6dt d d}d	t d
} fdd}td|||dS )zW
    This validator will suggest to the user to remove examples that are too long.
    Nopen-ended generationc                 S   s$   | j dd dd}|  j|  S )Nc                 S   s   t | jt | j dkS )Ni'  )r   r6   r7   r<   r   r   r   rD      r-   zClong_examples_validator.<locals>.get_long_indexes.<locals>.<lambda>   )axis)rM   rP   rQ   rR   )dlong_examplesr   r   r   get_long_indexes   s   z1long_examples_validator.<locals>.get_long_indexesr   rV   z. examples that are very long. These are rows: zf
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.rJ   z long examplesc                    s8    | }|krt jdt| d| d | |S )NzeThe indices of the long examples has changed as a result of a previously applied recommendation.
The z? long examples to be dropped are now at the following indices: 
)sysstdoutwriter   drop)r=   long_indexes_to_dropre   long_indexesr   r   r      s   
z,long_examples_validator.<locals>.optional_fnrd   r[   )infer_task_typer   r   )r   r
   r   r   ft_typer   rl   r   long_examples_validator   s"   	rp   c                    sl  d}d}d}d}dg d}|D ]}|dkr | j jd r q| j jj|dd r,q| dd}t| }|d	krBtd
dS dd  t| j dd}	| j |	k r`d|	 d}td
|dS |	dkr|	dd}
d|
 d}t	|	dkr~|d| d7 }| j jdt	|	  jj|	dd r|d|	 d7 }nd}|	dkrd| d} fdd}td||||d S )!z
    This validator will suggest to add a common suffix to the prompt if one doesn't already exist in case of classification or conditional generation.
    Nz


### =>

) ->z

###

z

===

z

---

z

===>

z

--->

rq   rf   Fregex\nr`   common_suffixr	   c                 S      | d  |7  < | S Nr6   r   r=   suffixr   r   r   
add_suffix      z2common_prompt_suffix_validator.<locals>.add_suffixrz   xfixzAll prompts are identical: `zt`
Consider leaving the prompts blank if you want to do open-ended generation, otherwise ensure prompts are differentr	   r   r   z 
- All prompts end with suffix `r4   
   R. This suffix seems very long. Consider replacing with a shorter suffix, such as `z5
  WARNING: Some of your prompts contain the suffix `zZ` more than once. We strongly suggest that you review your prompts and add a unique suffixa  
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts emptyzAdd a suffix separator `z` to all promptsc                    r.   r/   r   r<   r{   suggested_suffixr   r   r      r3   z3common_prompt_suffix_validator.<locals>.optional_fncommon_completion_suffixr	   r
   r   r   r   )
r6   r   containsrN   replacern   r   get_common_xfixallr   )r   r   r
   r   r   suffix_optionssuffix_optiondisplay_suggested_suffixro   ru   common_suffix_new_line_handledr   r   r   common_prompt_suffix_validator   s`   


r   c                    s   d}d}d}d}t | jdd  dkrtddS dd	 | j k r(tddS  dkrId
  d}|t k rI|d7 }d  d} fdd}td|||dS )zd
    This validator will suggest to remove a common prefix from the prompt if a long one exist.
       Nprefixr}   r   common_prefixrv   c                 S   s   | d j t|d  | d< | S rx   r   r   )r=   r   r   r   r   remove_common_prefix  s   z<common_prompt_prefix_validator.<locals>.remove_common_prefixz"
- All prompts start with prefix `r4   z. Fine-tuning doesn't require the instruction specifying the task, or a few-shot example scenario. Most of the time you should only add the input data into the prompt, and the desired output into the completionRemove prefix `z` from all promptsc                    s
   |  S r/   r   r<   r   r   r   r   r   (  r3   z3common_prompt_prefix_validator.<locals>.optional_fncommon_prompt_prefixr[   )r   r6   r   r   r   r   MAX_PREFIX_LENr
   r   r   r   r   r   common_prompt_prefix_validator  s,   

r   c                    s   d}t | jdd t dko d dkt |k r tddS dd	 | j k r0tddS d
  d}d  d} fdd}td|||dS )zh
    This validator will suggest to remove a common prefix from the completion if a long one exist.
       r   r}   r    r   rv   c                 S   s2   | d j t|d  | d< |rd| d  | d< | S )Nr7   r   r   )r=   r   	ws_prefixr   r   r   r   >  s   z@common_completion_prefix_validator.<locals>.remove_common_prefixz&
- All completions start with prefix `z_`. Most of the time you should only add the output data into the completion, without any prefixr   z` from all completionsc                    s   |  S r/   r   r<   r   r   r   r   r   r   L  rY   z7common_completion_prefix_validator.<locals>.optional_fncommon_completion_prefixr[   )r   r7   r   r   r   r   r   r   r   "common_completion_prefix_validator3  s"   

r   c                    s^  d}d}d}d}t | }|dks|dkrtddS t| jdd}| j|k r6d| d	| d
}td|dS dg d}|D ]}| jjj|dd rLq>| dd}	dd  |dkr|dd}
d|
 d
}t	|dkrw|d|	 d
7 }| jjdt	|  jj|dd r|d| d7 }nd}|dkrd|	 d} fdd}td||||d S )!z
    This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.
    Nr`   classificationru   rv   rz   r}   z All completions are identical: `zJ`
Ensure completions are different, otherwise the model will just repeat `r4   r   z [END])	rf   .z ENDz***z+++z&&&z$$$z@@@z%%%Frr   rf   rt   c                 S   rw   Nr7   r   ry   r   r   r   r{   }  r|   z6common_completion_suffix_validator.<locals>.add_suffixr   z$
- All completions end with suffix `r   r   z9
  WARNING: Some of your completions contain the suffix `zU` more than once. We suggest that you review your completions and add a unique endingaH  
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.zAdd a suffix ending `z` to all completionsc                    r.   r/   r   r<   r   r   r   r     r3   z7common_completion_suffix_validator.<locals>.optional_fnr   r   )
rn   r   r   r7   r   r   r   rN   r   r   )r   r   r
   r   r   ro   ru   r   r   r   r   r   r   r   "common_completion_suffix_validatorW  sZ   


r   c                 C   s\   dd }d}d}d}| j jdd  dks | j jd d dkr&d}d}|}td	|||d
S )z
    This validator will suggest to add a space at the start of the completion if it doesn't already exist. This helps with tokenization.
    c                 S   s   | d  dd | d< | S )Nr7   c                 S   s   | d dkr
d|  S d|  S )Nr   r   r   r   r<   r   r   r   rD     s    zLcompletions_space_start_validator.<locals>.add_space_start.<locals>.<lambda>)rM   r<   r   r   r   add_space_start  s   z:completions_space_start_validator.<locals>.add_space_startNra   r   r   z
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz=Add a whitespace character to the beginning of the completioncompletion_space_startr[   )r7   r   nuniquevaluesr   )r   r   r   r   r
   r   r   r   !completions_space_start_validator  s   ,r   c                    sn    fdd}|    dd  }|    dd  }|d |kr5tdd  d	  d
d  d|dS dS )zt
    This validator will suggest to lowercase the column values, if more than a third of letters are uppercase.
    c                    s   |   j  |  < | S r/   r    r<   r%   r   r   
lower_case  s   z(lower_case_validator.<locals>.lower_casec                 S      t dd | D S )Nc                 s   $    | ]}|  r| rd V  qdS ra   N)isalphaisupperr"   r   r   r   	<genexpr>     " 9lower_case_validator.<locals>.<lambda>.<locals>.<genexpr>sumr<   r   r   r   rD         z&lower_case_validator.<locals>.<lambda>c                 S   r   )Nc                 s   r   r   )r   islowerr"   r   r   r   r     r   r   r   r<   r   r   r   rD     r   r8   r   z
- More than a third of your `z%` column/key is uppercase. Uppercase zs tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz'Lowercase all your data in column/key `r4   r[   N)rM   r   r   )r   r&   r   count_uppercount_lowerr   r%   r   lower_case_validator  s"   


r   c              
   C   s  t   d}d}d}d}d}tj| rTz|  ds$|  drI|  dr-dnd\}}d| d}d| d	}tj| |td
	d}n|  drqd}d}t
| }	|	j}
t|
dkrf|d7 }tj| td	d}n|  drd}d}t| d}| }tjdd |dD |td	d}W d   n1 sw   Y  n|  drtj| dtd	d}t|dkrd}d}tj| td	d}na	 n_|  drz"tj| dtd	d}t|dkrtj| td	d}nd }d}W n4 ty   tj| td	d}Y n!w d!}d"| v r)|d#|  d$| d"d%  d&7 }n|d#|  d'7 }W n' ttfyS   | d"d%  }d(|  d)| d*| d+}Y nw d,|  d-}td.|||d/}||fS )0z
    This function will read a file saved in .csv, .json, .txt, .xlsx or .tsv format using pandas.
     - for .xlsx it will read the first sheet
     - for .txt it will assume completions and split on newline
    Nz.csvz.tsv)CSV,)TSV	z=
- Based on your file extension, your file is formatted as a z filezYour format `z` will be converted to `JSONL`)sepdtyper   z.xlsxzH
- Based on your file extension, your file is formatted as an Excel filez/Your format `XLSX` will be converted to `JSONL`ra   z
- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet...)r   z.txtz9
- Based on your file extension, you provided a text filez.Your format `TXT` will be converted to `JSONL`rc                 S   s   g | ]}d |gqS )r   r   )r#   liner   r   r   r'     s    z#read_any_format.<locals>.<listcomp>rf   )r(   r   .jsonlT)linesr   z^
- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL formatz/Your format `JSON` will be converted to `JSONL`z.jsonz^
- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL formatz]Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONLr   z Your file `z` ends with the extension `.z` which is not supported.z` is missing a file extension.zYour file `z!` does not appear to be in valid z9 format. Please ensure your file is formatted as a valid z file.zFile z does not exist.read_any_format)r	   r   r
   r   )r   ospathisfiler!   endswithpdread_csvr   fillna	ExcelFilesheet_namesr   
read_excelopenread	DataFramesplit	read_json
ValueError	TypeErrorupperr   )fnamer:   remediationr   r
   r   r   file_extension_str	separatorxlssheetsfcontentr   r   r   r     s   


"r   c                 C   s,   t | }d}|dkrd| d}td|dS )z
    This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.
    It will also suggest to use ada and explain train/validation split benefits.
    Nr   zK
- Based on your data it seems like you're trying to fine-tune a model for z
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for trainingr   r   )rn   r   )r   ro   r
   r   r   r   format_inferrer_validator7  s
   r   c                 C   sb   |j durtjd|j d|j  d td |jdur%tj|j |jdur/|| } | S )zs
    This function will apply a necessary remediation to a dataframe, or print an error message if one exists.
    Nz

ERROR in z validator: z

Aborting...ra   )	r   rg   stderrri   r	   exitr
   rh   r   )r   r   r   r   r   apply_necessary_remediationC  s   




r   c                 C   s.   t j|  |rt jd dS t  dkS )NzY
Tn)rg   rh   ri   inputr!   )
input_textauto_acceptr   r   r   accept_suggestionS  s
   r   c                 C   s\   d}d|j  d}|j durt||r|| } d}|jdur*tjd|j d | |fS )zc
    This function will apply an optional remediation to a dataframe, based on the user input.
    Fz- [Recommended] z [Y/n]: NTz- [Necessary] rf   )r   r   r   r   rg   rh   ri   )r   r   r   optional_appliedr   r   r   r   apply_optional_remediation[  s   



r   c                 C   sj   t | }d}|dkrt| }|d }n| jdd }|d }dd }||d	 }tjd
| d dS )z?
    Estimate the time it'll take to fine-tune the dataset
    g      ?r   g
ףp=
?T)rQ   g|?5^?c                 S   sd   | dk rt | d dS | dk rt | d d dS | dk r(t | d d dS t | d d dS )	N<   r8   z secondsi  z minutesiQ z hoursz days)round)timer   r   r   format_timew  s   z.estimate_fine_tuning_time.<locals>.format_time   z:Once your model starts training, it'll approximately take z~ to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.
N)rn   r   memory_usager   rg   rh   ri   )r   	ft_formatexpected_timer   sizer   time_stringr   r   r   estimate_fine_tuning_timej  s   


r   c                    sd   |rddgndg}d}	 |dkrd| dnd fdd	|D }t d
d |D s-|S |d7 }q)N_train_validr   r   Tz ()c                    s,   g | ]}t j d  d |  d qS )r   	_preparedr   )r   r   splitext)r#   rz   r   index_suffixr   r   r'     s    z!get_outfnames.<locals>.<listcomp>c                 s   s    | ]	}t j|V  qd S r/   )r   r   r   )r#   r   r   r   r   r     s    z get_outfnames.<locals>.<genexpr>ra   )rN   )r   r   suffixesicandidate_fnamesr   r   r   get_outfnames  s   r   c                 C   s.   | j  }d }|dkr| j  jd }||fS )Nr8   r   )r7   r   value_countsrQ   )r   	n_classes	pos_classr   r   r   get_classification_hyperparams  s
   
r  c                 C   s~  t | }t| jdd}t| jdd}d}d}|dkr!t||r!d}d}	|dd	}
|dd	}t|d
kr;d| dnd}d}|s\|s\tj	d| d|	 d|
 d| d	 t
|  d*S t||r7t||}|rt|dkr{d|d
 v r{d|d v s}J d}tt| | tt| d }| j|dd}| |j}|ddg j|d
 dddd |ddg j|d dddd t| \}}|	d7 }	|dkr|	d| d7 }	n|	d | 7 }	nt|dksJ | ddg j|d
 dddd |rd!ndd" d#| }|rd$|d  dnd}t|
d
krdnd%|
 d}tj	d&| d'|d
  d| |	 d(| | d t
|  d*S tj	d) d*S )+aQ  
    This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file.
    For classification it will optionally ask the user if they would like to split the data into train/valid files, and modify the suggested command to include the valid set.
    rz   r}   FzQ- [Recommended] Would you like to split into training and validation set? [Y/n]: r   Tr   rf   rt   r   z Make sure to include `stop=["z;"]` so that the generated texts ends at the expected place.z@

Your data will be written to a new JSONL file. Proceed [Y/n]: zK
You can use your file for fine-tuning:
> openai api fine_tunes.create -t ""ue   

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `zX` for the model to start generating completions, rather than continuing with the prompt.r8   trainvalidra   i  g?*   )r   random_stater6   r7   records)r   orientforce_asciiz! --compute_classification_metricsz" --classification_positive_class "z --classification_n_classes rK   z to `z` and `z -v "uc   After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `z
Wrote modified filezd`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "z

z#Aborting... did not write the file
N)rn   r   r6   r7   r   r   r   rg   rh   ri   r   r   maxintsamplerj   rQ   to_jsonr  r]   )r   r   any_remediationsr   r   common_prompt_suffixr   r   r   additional_params%common_prompt_suffix_new_line_handled)common_completion_suffix_new_line_handledoptional_ending_stringfnamesMAX_VALID_EXAMPLESn_traindf_traindf_validr  r  files_stringvalid_stringseparator_reminderr   r   r   write_out_file  sr   

(
(r  c                 C   s>   d}t | jj dkrdS t| j t| | k rdS dS )z>
    Infer the likely fine-tuning task type from the data
       r   r`   r   zconditional generation)r   r6   r   r   r7   unique)r   CLASSIFICATION_THRESHOLDr   r   r   rn     s   rn   rz   c                 C   sn   d}	 |dkr| j t|d  d n
| j dt|d  }| dkr'	 |S ||jd kr1	 |S |jd }q)zQ
    Finds the longest common suffix or prefix of all the values in a series
    r   Trz   ra   Nr   )r   r   r   r   )seriesr~   common_xfixcommon_xfixesr   r   r   r     s   
r   c                   C   s2   t dd dd tttttdd dd tttt	t
gS )Nc                 S   
   t | dS rx   r5   r<   r   r   r   rD        
 z get_validators.<locals>.<lambda>c                 S   r%  r   r&  r<   r   r   r   rD     r'  c                 S   r%  rx   r   r<   r   r   r   rD     r'  c                 S   r%  r   r(  r<   r   r   r   rD     r'  )r   rC   rU   r   r_   rp   r   r   r   r   r   r   r   r   r   get_validators  s    r)  c                 C   s   g }|d ur| | |D ]}|| }|d ur!| | t| |} qtdd |D }tdd |D }	d}
|rPtjd |D ]}t| ||\} }|
pM|}
q@ntjd |
pY|	}|| ||| d S )Nc                 S   s$   g | ]}|j d us|jd ur|qS r/   )r   r   r#   r   r   r   r   r'   6  s    

z$apply_validators.<locals>.<listcomp>c                 S   s   g | ]	}|j d ur|qS r/   )r   r*  r   r   r   r'   >  s
    
Fz?

Based on the analysis we will perform the following actions:
z

No remediations found.
)appendr   rN   rg   rh   ri   r   )r   r   r   
validatorsr   write_out_file_funcoptional_remediations	validator&any_optional_or_necessary_remediationsany_necessary_appliedany_optional_appliedr   !any_optional_or_necessary_appliedr   r   r   apply_validators$  sB   



r4  )r7   )rz   )%r   rg   typingr   r   r   r   openai.datalib.pandas_helperr   r   r   r   r   r5   rC   rU   r_   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  rn   r   r)  r4  r   r   r   r   <module>   s>    
&
(L'$L\M
