import pandas as pd
import datetime
import numpy as np
- import time
import openai
import os
from sandbox import Sandbox
import re
import json

class PandasLLM(pd.DataFrame):
+     """
+     PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a
+     wrapper around the OpenAI API.
+     """

    code_blocks = [r'```python(.*?)```', r'```(.*?)```']

@@ -17,31 +20,63 @@ class PandasLLM(pd.DataFrame):
    llm_engine = "openai"
    llm_default_params = {"model": llm_default_model,
                          "temperature": llm_default_temperature}
-     openai_api_key = None
+     llm_api_key = None

    prompt_override = False
    custom_prompt = ""
    data_privacy = True
    path = None
    verbose = False
    code_block = ""
+     force_sandbox = False
    def __init__(self,
-                  data=None,
-                  llm_engine="openai", llm_params=llm_default_params,
-                  prompt_override=False,
-                  custom_prompt="",
-                  path=None,
-                  verbose=False,
-                  data_privacy=True,
-                  openai_api_key=None,
+                  data,
+                  llm_engine: str = "openai", llm_params=llm_default_params,
+                  prompt_override: bool = False,
+                  custom_prompt: str = "",
+                  path: str = None,
+                  verbose: bool = False,
+                  data_privacy: bool = True,
+                  llm_api_key: str = None,
+                  force_sandbox: bool = False,
                 *args, **kwargs):
+         """
+         This is the constructor for the PandasLLM class. It takes in the following arguments:
+         data: The data to be used. It can be a Pandas DataFrame, a list of lists, a list of tuples,
+               a list of dictionaries, a dictionary, a string, or a list.
+         llm_engine: The name of the OpenAI engine to use.
+         llm_params: A dictionary of parameters to be used with the OpenAI API.
+         prompt_override: A boolean that determines whether or not the prompt is overridden.
+         custom_prompt: A string that overrides the prompt.
+         path: The path to the file to be used.
+         verbose: A boolean that determines whether or not the output is verbose.
+         data_privacy: A boolean that determines whether or not the data is private.
+         llm_api_key: The OpenAI API key to be used.
+         force_sandbox: if False and the sandbox fails, it will retry using eval (less safe).
+
+         The constructor also calls the parent class's constructor.
+

+         Args:
+             data (pandas dataframe, mandatory): dataset to query.
+             llm_engine (str, optional): LLM engine; currently only OpenAI is supported. Defaults to "openai".
+             llm_params (dict, optional): LLM engine parameters. Defaults to model=gpt-3.5-turbo and temperature=0.2.
+             prompt_override (bool, optional): if True, the custom prompt is mandatory and it will become the main prompt. Defaults to False.
+             custom_prompt (str, optional): if prompt_override is False, the custom prompt will be added to the default pandas_llm prompt. Defaults to "".
+             path (str, optional): the path where the files containing debug data will be saved. Defaults to None.
+             verbose (bool, optional): if True, debugging info will be printed. Defaults to False.
+             data_privacy (bool, optional): if True, the function will not send the data content to OpenAI. Defaults to True.
+             llm_api_key (str, optional): the OpenAI API key. Defaults to None.
+             force_sandbox (bool, optional): if False and the sandbox fails, it will retry using eval (less safe). Defaults to False.
+         """
+
+
        super().__init__(data, *args, **kwargs)

        self.llm_params = llm_params or {}

        # Set up OpenAI API key from the environment or the config
-         self.openai_api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
+         self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY")

        self.llm_engine = llm_engine
        self.llm_params = llm_params or {}
@@ -54,8 +89,9 @@ def __init__(self,
        self.data_privacy = data_privacy
        self.path = path
        self.verbose = verbose
+         self.force_sandbox = force_sandbox

-     def buildPromptForRole(self):
+     def _buildPromptForRole(self):
        prompt_role = f"""
        I want you to act as a data scientist and Python coder. I want you to code for me.
        I have a dataset of {len(self)} rows and {len(self.columns)} columns.
@@ -68,7 +104,7 @@ def buildPromptForRole(self):

        return prompt_role

-     def buildPromptForProblemSolving(self, request):
+     def _buildPromptForProblemSolving(self, request):

        if self.prompt_override:
            return self.custom_prompt
@@ -105,7 +141,7 @@ def buildPromptForProblemSolving(self, request):

        return prompt_problem

-     def extractPythonCode(self, text: str, regexp: str) -> str:
+     def _extractPythonCode(self, text: str, regexp: str) -> str:
        # Define the regular expression pattern for the Python code block
        pattern = regexp

@@ -119,44 +155,44 @@ def extractPythonCode(self, text: str, regexp: str) -> str:
        # If no match is found, return an empty string
        return ""

-     def print(self, *args, **kwargs):
+     def _print(self, *args, **kwargs):
        if self.verbose:
            print(*args, **kwargs)

-     def variable_to_string(self, variable):
-         if variable is None: return None
-         try:
-
-             if isinstance(variable, pd.Series):
-                 # convert to dataframe
-                 variable = variable.to_frame()
-
-             if isinstance(variable, pd.DataFrame):
-                 variable = variable.drop_duplicates()
-                 if len(variable) == 0: return None
-                 return str(variable)
-
-             elif isinstance(variable, np.ndarray):
-                 if len(variable) == 0: return None
-                 return np.array2string(variable)
-             else:
-                 # Convert the variable to a string
-                 return str(variable)
-         except Exception as e:
-             return str(variable)
+     # def _variable_to_string(self, variable):
+     #     if variable is None: return None
+     #     try:
+
+     #         if isinstance(variable, pd.Series):
+     #             # convert to dataframe
+     #             variable = variable.to_frame()
+
+     #         if isinstance(variable, pd.DataFrame):
+     #             variable = variable.drop_duplicates()
+     #             if len(variable) == 0: return None
+     #             return str(variable)
+
+     #         elif isinstance(variable, np.ndarray):
+     #             if len(variable) == 0: return None
+     #             return np.array2string(variable)
+     #         else:
+     #             # Convert the variable to a string
+     #             return str(variable)
+     #     except Exception as e:
+     #         return str(variable)


-     def save(self, name, value):
+     def _save(self, name, value):
        if self.path is None or self.path == "":
            return
        try:
            with open(f"{self.path}/{name}", 'w') as file:
                file.write(value)
        except Exception as e:
-             self.print(f"error {e}")
+             self._print(f"error {e}")
        return

-     def execInSandbox(self, df, generated_code: str):
+     def _execInSandbox(self, df, generated_code: str):

        # Create a Sandbox instance and allow pandas to be imported
        sandbox = Sandbox()
@@ -175,25 +211,32 @@ def execInSandbox(self, df, generated_code:str):
        # Combine the initial code and the generated code
        full_code = initial_code + "\n" + generated_code

-         self.save("temp/prompt_code.py", full_code)
+         self._save("temp/prompt_code.py", full_code)
        # Execute the combined code in the Sandbox
        sandbox_result = sandbox.execute(full_code, {"df": df})

        # Get the result from the local_vars dictionary
        result = sandbox_result.get("result")
        return result

+     def prompt(self, request: str):
+         """

+         Args:
+             request (str): prompt containing the request. It must be expressed as a question or a problem to solve.

-     def prompt(self, request: str):
+         Returns:
+             Any: contains the result or solution of the problem. Typically the result data type is a DataFrame, a Series, or a float.
+         """
+
        # Set up OpenAI API key
-         openai.api_key = self.openai_api_key
+         openai.api_key = self.llm_api_key

        messages = [
            {"role": "system",
-              "content": self.buildPromptForRole()},
+              "content": self._buildPromptForRole()},
            {"role": "user",
-              "content": self.buildPromptForProblemSolving(request)
+              "content": self._buildPromptForProblemSolving(request)
            }
        ]

@@ -207,13 +250,13 @@ def prompt(self, request: str):
                )
                break;
            except Exception as e:
-                 self.print(f"error {e}")
+                 self._print(f"error {e}")
                continue

        if response is None:
            return "Please try later"

-         self.save("temp/prompt_cmd.json", json.dumps(messages, indent=4))
+         self._save("temp/prompt_cmd.json", json.dumps(messages, indent=4))

        generated_code = response.choices[0].message.content
        if generated_code == "" or generated_code is None:
@@ -224,7 +267,7 @@ def prompt(self, request: str):

        results = []
        for regexp in self.code_blocks:
-             cleaned_code = self.extractPythonCode(generated_code, regexp)
+             cleaned_code = self._extractPythonCode(generated_code, regexp)
            if cleaned_code == "" or cleaned_code is None:
                continue
            results.append(cleaned_code)
@@ -233,19 +276,20 @@ def prompt(self, request: str):
        if len(results) == 0:
            return None

+         result = None
        for cleaned_code in results:
-
-             result = None
+
            try:
                result = self._execInSandbox(self, cleaned_code)
            except Exception as e:
-                 self.print(f"error {e}")
-                 try:
-                     expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
-                     result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result})
-                 except Exception as e:
-                     self.print(f"error {e}")
-                     pass
+                 self._print(f"error {e}")
+                 if not self.force_sandbox:
+                     try:
+                         expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
+                         result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result})
+                     except Exception as e:
+                         self._print(f"error {e}")
+                         pass

            if result is not None and str(result) != "":
                break
@@ -257,6 +301,6 @@ def prompt(self, request: str):
        # currently the privacy option is not needed.
        # in the future, we can choose to send data to LLM if privacy is set to false

-         return None
+         return result

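Usage note: the sketch below shows how the class reads after this commit, with the renamed llm_api_key parameter, the new force_sandbox flag, and the prompt() entry point. It assumes the class is importable as pandas_llm.PandasLLM and that an OpenAI key is available in the OPENAI_API_KEY environment variable; the sample dataframe and the question are illustrative only, not part of this commit.

# Minimal usage sketch (assumptions: the package exposes PandasLLM as
# pandas_llm.PandasLLM and OPENAI_API_KEY is set in the environment;
# the sample data and question are illustrative only).
import os
import pandas as pd
from pandas_llm import PandasLLM

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Carol"],
    "age": [31, 45, 27],
})

conv_df = PandasLLM(
    data=df,
    llm_api_key=os.environ.get("OPENAI_API_KEY"),  # falls back to the env var when None
    force_sandbox=False,  # if the sandbox fails, retry with eval() (less safe)
    verbose=True,
)

result = conv_df.prompt("What is the average age?")
print(result)  # typically a DataFrame, a Series, or a float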