import pandas as pd
import datetime
import numpy as np
- import time
import openai
import os
from sandbox import Sandbox
import re
import json

class PandasLLM(pd.DataFrame):
+     """
+     PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a
+     wrapper around the OpenAI API.
+     """

    code_blocks = [r'```python(.*?)```', r'```(.*?)```']

@@ -17,31 +20,63 @@ class PandasLLM(pd.DataFrame):
    llm_engine = "openai"
    llm_default_params = {"model": llm_default_model,
                          "temperature": llm_default_temperature}
-     openai_api_key = None
+     llm_api_key = None

    prompt_override = False
    custom_prompt = ""
    data_privacy = True
    path = None
    verbose = False
    code_block = ""
+     force_sandbox = False
    def __init__(self,
-                  data=None,
-                  llm_engine="openai", llm_params=llm_default_params,
-                  prompt_override=False,
-                  custom_prompt="",
-                  path=None,
-                  verbose=False,
-                  data_privacy=True,
-                  openai_api_key=None,
+                  data,
+                  llm_engine: str = "openai", llm_params=llm_default_params,
+                  prompt_override: bool = False,
+                  custom_prompt: str = "",
+                  path: str = None,
+                  verbose: bool = False,
+                  data_privacy: bool = True,
+                  llm_api_key: str = None,
+                  force_sandbox: bool = False,
                 *args, **kwargs):
+         """
+         This is the constructor for the PandasLLM class. It takes in the following arguments:
+         data: The data to be used. It can be a Pandas DataFrame, a list of lists, a list of tuples,
+               a list of dictionaries, a dictionary, a string, or a list.
+         llm_engine: The name of the OpenAI engine to use.
+         llm_params: A dictionary of parameters to be used with the OpenAI API.
+         prompt_override: A boolean that determines whether or not the prompt is overridden.
+         custom_prompt: A string that overrides the prompt.
+         path: The path to the file to be used.
+         verbose: A boolean that determines whether or not the output is verbose.
+         data_privacy: A boolean that determines whether or not the data is private.
+         llm_api_key: The OpenAI API key to be used.
+         force_sandbox: if False and the sandbox fails, it will retry using eval (less safe).
+
+         The constructor also calls the parent class's constructor.
+

+         Args:
+             data (pandas dataframe, mandatory): dataset to query.
+             llm_engine (str, optional): LLM engine; currently only OpenAI is supported. Defaults to "openai".
+             llm_params (dict, optional): LLM engine parameters. Defaults to model=gpt-3.5-turbo and temperature=0.2.
+             prompt_override (bool, optional): if True, the custom prompt is mandatory and it will become the main prompt. Defaults to False.
+             custom_prompt (str, optional): if prompt_override is False, the custom prompt will be added to the default pandas_llm prompt. Defaults to "".
+             path (str, optional): the path where the files containing debug data will be saved. Defaults to None.
+             verbose (bool, optional): if True, debugging info will be printed. Defaults to False.
+             data_privacy (bool, optional): if True, the function will not send the data content to OpenAI. Defaults to True.
+             llm_api_key (str, optional): the OpenAI API key. Defaults to None.
+             force_sandbox (bool, optional): if False and the sandbox fails, it will retry using eval (less safe). Defaults to False.
+         """
+
+
        super().__init__(data, *args, **kwargs)

        self.llm_params = llm_params or {}

        # Set up OpenAI API key from the environment or the config
-         self.openai_api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
+         self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY")

        self.llm_engine = llm_engine
        self.llm_params = llm_params or {}
@@ -54,8 +89,9 @@ def __init__(self,
        self.data_privacy = data_privacy
        self.path = path
        self.verbose = verbose
+         self.force_sandbox = force_sandbox

-     def buildPromptForRole(self):
+     def _buildPromptForRole(self):
        prompt_role = f"""
        I want you to act as a data scientist and Python coder. I want you to code for me.
        I have a dataset of {len(self)} rows and {len(self.columns)} columns.
@@ -68,7 +104,7 @@ def buildPromptForRole(self):

        return prompt_role

-     def buildPromptForProblemSolving(self, request):
+     def _buildPromptForProblemSolving(self, request):

        if self.prompt_override:
            return self.custom_prompt
@@ -105,7 +141,7 @@ def buildPromptForProblemSolving(self, request):

        return prompt_problem

-     def extractPythonCode(self, text: str, regexp: str) -> str:
+     def _extractPythonCode(self, text: str, regexp: str) -> str:
        # Define the regular expression pattern for the Python code block
        pattern = regexp

@@ -119,44 +155,44 @@ def extractPythonCode(self, text: str, regexp: str) -> str:
        # If no match is found, return an empty string
        return ""

-     def print(self, *args, **kwargs):
+     def _print(self, *args, **kwargs):
        if self.verbose:
            print(*args, **kwargs)

-     def variable_to_string(self, variable):
-         if variable is None: return None
-         try:
-
-             if isinstance(variable, pd.Series):
-                 # convert to dataframe
-                 variable = variable.to_frame()
-
-             if isinstance(variable, pd.DataFrame):
-                 variable = variable.drop_duplicates()
-                 if len(variable) == 0: return None
-                 return str(variable)
-
-             elif isinstance(variable, np.ndarray):
-                 if len(variable) == 0: return None
-                 return np.array2string(variable)
-             else:
-                 # Convert the variable to a string
-                 return str(variable)
-         except Exception as e:
-             return str(variable)
+     # def _variable_to_string(self, variable):
+     #     if variable is None: return None
+     #     try:
+
+     #         if isinstance(variable, pd.Series):
+     #             # convert to dataframe
+     #             variable = variable.to_frame()
+
+     #         if isinstance(variable, pd.DataFrame):
+     #             variable = variable.drop_duplicates()
+     #             if len(variable) == 0: return None
+     #             return str(variable)
+
+     #         elif isinstance(variable, np.ndarray):
+     #             if len(variable) == 0: return None
+     #             return np.array2string(variable)
+     #         else:
+     #             # Convert the variable to a string
+     #             return str(variable)
+     #     except Exception as e:
+     #         return str(variable)


-     def save(self, name, value):
+     def _save(self, name, value):
        if self.path is None or self.path == "":
            return
        try:
            with open(f"{self.path}/{name}", 'w') as file:
                file.write(value)
        except Exception as e:
-             self.print(f"error {e}")
+             self._print(f"error {e}")
        return

-     def execInSandbox(self, df, generated_code: str):
+     def _execInSandbox(self, df, generated_code: str):

        # Create a Sandbox instance and allow pandas to be imported
        sandbox = Sandbox()
@@ -175,25 +211,32 @@ def execInSandbox(self, df, generated_code:str):
        # Combine the initial code and the generated code
        full_code = initial_code + "\n" + generated_code

-         self.save("temp/prompt_code.py", full_code)
+         self._save("temp/prompt_code.py", full_code)
        # Execute the combined code in the Sandbox
        sandbox_result = sandbox.execute(full_code, {"df": df})

        # Get the result from the local_vars dictionary
        result = sandbox_result.get("result")
        return result

+     def prompt(self, request: str):
+         """

+         Args:
+             request (str): prompt containing the request. It must be expressed as a question or a problem to solve.

-     def prompt(self, request: str):
+         Returns:
+             Any: contains the result or solution of the problem. Typically the result data type is a DataFrame, a Series, or a float.
+         """
+
        # Set up OpenAI API key
-         openai.api_key = self.openai_api_key
+         openai.api_key = self.llm_api_key

        messages = [
            {"role": "system",
-              "content": self.buildPromptForRole()},
+              "content": self._buildPromptForRole()},
            {"role": "user",
-              "content": self.buildPromptForProblemSolving(request)
+              "content": self._buildPromptForProblemSolving(request)
            }
        ]

@@ -207,13 +250,13 @@ def prompt(self, request: str):
                )
                break;
            except Exception as e:
-                 self.print(f"error {e}")
+                 self._print(f"error {e}")
                continue

        if response is None:
            return "Please try later"

-         self.save("temp/prompt_cmd.json", json.dumps(messages, indent=4))
+         self._save("temp/prompt_cmd.json", json.dumps(messages, indent=4))

        generated_code = response.choices[0].message.content
        if generated_code == "" or generated_code is None:
@@ -224,7 +267,7 @@ def prompt(self, request: str):

        results = []
        for regexp in self.code_blocks:
-             cleaned_code = self.extractPythonCode(generated_code, regexp)
+             cleaned_code = self._extractPythonCode(generated_code, regexp)
            if cleaned_code == "" or cleaned_code is None:
                continue
            results.append(cleaned_code)
@@ -233,19 +276,20 @@ def prompt(self, request: str):
        if len(results) == 0:
            return None

+         result = None
        for cleaned_code in results:
-
-             result = None
+
            try:
                result = self._execInSandbox(self, cleaned_code)
            except Exception as e:
-                 self.print(f"error {e}")
-                 try:
-                     expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
-                     result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result})
-                 except Exception as e:
-                     self.print(f"error {e}")
-                     pass
+                 self._print(f"error {e}")
+                 if not self.force_sandbox:
+                     try:
+                         expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
+                         result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result})
+                     except Exception as e:
+                         self._print(f"error {e}")
+                         pass

            if result is not None and str(result) != "":
                break
@@ -257,6 +301,6 @@ def prompt(self, request: str):
        # currently the privacy option is not needed.
        # in the future, we can choose to send data to LLM if privacy is set to false

-         return None
+         return result

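Usage note: the sketch below shows how the class reads after this commit, with the renamed llm_api_key parameter, the new force_sandbox flag, and the prompt() entry point. It assumes the class is importable as pandas_llm.PandasLLM and that an OpenAI key is available in the OPENAI_API_KEY environment variable; the sample dataframe and the question are illustrative only, not part of this commit.

# Minimal usage sketch (assumptions: the package exposes PandasLLM as
# pandas_llm.PandasLLM and OPENAI_API_KEY is set in the environment;
# the sample data and question are illustrative only).
import os
import pandas as pd
from pandas_llm import PandasLLM

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Carol"],
    "age": [31, 45, 27],
})

conv_df = PandasLLM(
    data=df,
    llm_api_key=os.environ.get("OPENAI_API_KEY"),  # falls back to the env var when None
    force_sandbox=False,  # if the sandbox fails, retry with eval() (less safe)
    verbose=True,
)

result = conv_df.prompt("What is the average age?")
print(result)  # typically a DataFrame, a Series, or a float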