Helpers

This module includes methods classified as helpers.

Anonymizer

A collection of methods to help handle sensitive information

pandasai.helpers.anonymizer

This module contains helper functions for anonymizing data and generating random data before it is sent to the LLM (an external API). Only df.head() is sent to the LLM API, so only df.head() is processed to remove any personal or sensitive information.

anonymize_dataframe_head(data_frame, force_conversion=True)

Anonymize the head of a given DataFrame by replacing sensitive data.

Parameters:

Name Type Description Default
data_frame pd.DataFrame

The DataFrame whose head is to be anonymized.

required
force_conversion bool

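If True, categorical columns that contain missing values are temporarily converted to object dtype while the head is anonymized. Default is True.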

True
Source code in pandasai/helpers/anonymizer.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def anonymize_dataframe_head(
    data_frame: pd.DataFrame, force_conversion: bool = True
) -> pd.DataFrame:
    """Anonymize the head of a given DataFrame by replacing sensitive data.

    The function works on a copy of the head rows obtained from ``copy_head``.
    For every cell, the string form of the value is checked:

    * email addresses, phone numbers and credit card numbers are replaced
      with randomly generated values;
    * any other value is swapped with the value of a randomly chosen other
      row of the same column.

    When the head has fewer than two rows there is no other row to swap
    with, so non-matching values are left unchanged instead of raising.

    Args:
        data_frame (pd.DataFrame): The DataFrame whose head is anonymized.
        force_conversion (bool): If True, categorical columns that contain
            missing values are temporarily converted to object dtype.
            Default is True.

    Returns:
        pd.DataFrame: Anonymized copy of the head, cast back to the dtypes
        the head had before anonymization.
    """

    data_frame = copy_head(data_frame)
    dtypes = data_frame.dtypes
    row_count = len(data_frame.index)
    for col in data_frame.columns:
        col_idx = data_frame.columns.get_loc(col)
        # check category type column and temporarily convert to object type
        if force_conversion:
            # isinstance check replaces the deprecated
            # pd.api.types.is_categorical_dtype helper.
            if isinstance(data_frame[col].dtype, pd.CategoricalDtype):
                if data_frame[col].isna().any():
                    data_frame[col] = data_frame[col].astype(object)
        for row_idx, val in enumerate(data_frame[col]):
            cell_value = str(val)

            if is_valid_email(cell_value):
                data_frame.iloc[row_idx, col_idx] = generate_random_email()
                continue
            if is_valid_phone_number(cell_value):
                data_frame.iloc[row_idx, col_idx] = generate_random_phone_number(
                    cell_value
                )
                continue
            if is_valid_credit_card(cell_value):
                data_frame.iloc[row_idx, col_idx] = generate_random_credit_card()
                continue

            # anonymize data by swapping with another row of the same column
            if row_count < 2:
                # A single-row head has no swap partner; random.choice on an
                # empty candidate list would raise IndexError.
                continue
            random_row_index = random.choice(
                [i for i in range(row_count) if i != row_idx]
            )
            random_value = data_frame.iloc[random_row_index, col_idx]
            data_frame.iloc[row_idx, col_idx] = random_value
            data_frame.iloc[random_row_index, col_idx] = cell_value
    # restore the original data types
    data_frame = data_frame.astype(dtypes)
    return data_frame

copy_head(data_frame)

Copy the head of a DataFrame.

Parameters:

Name Type Description Default
data_frame pd.DataFrame

The pd.DataFrame to copy the head from.

required

Returns (pd.DataFrame): copied head of the DataFrame.

Source code in pandasai/helpers/anonymizer.py
122
123
124
125
126
127
128
129
130
131
132
def copy_head(data_frame: pd.DataFrame) -> pd.DataFrame:
    """Return an independent copy of the head of a DataFrame.

    Args:
        data_frame (pd.DataFrame): The pd.DataFrame to copy the head from.

    Returns (pd.DataFrame): copy of ``data_frame.head()``.
    """
    head_rows = data_frame.head()
    return head_rows.copy()

generate_random_credit_card()

Generate a random credit card number.

Returns (str): generated random credit card number.

Source code in pandasai/helpers/anonymizer.py
107
108
109
110
111
112
113
114
115
116
117
118
119
def generate_random_credit_card() -> str:
    """Build a random 16-digit credit card number.

    The number is made of four groups of four random digits, joined by a
    randomly chosen separator (either a dash or a space).

    Returns (str): generated random credit card number.
    """
    # The digit groups are drawn first, then the separator.
    digit_groups = ["".join(random.choices("0123456789", k=4)) for _ in range(4)]
    return random.choice(["-", " "]).join(digit_groups)

generate_random_email()

Generates a random email address using predefined domains.

Returns (str): generated random email address.

Source code in pandasai/helpers/anonymizer.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def generate_random_email() -> str:
    """Create a random email address on one of a fixed set of domains.

    The username is 6 to 12 characters long and drawn from lowercase
    letters, digits, ``-`` and ``_``.

    Returns (str): generated random email address.
    """
    known_domains = [
        "gmail.com",
        "yahoo.com",
        "hotmail.com",
        "outlook.com",
        "icloud.com",
        "aol.com",
        "protonmail.com",
        "zoho.com",
    ]
    username_size = random.randint(6, 12)
    chosen_domain = random.choice(known_domains)
    alphabet = string.ascii_lowercase + string.digits + "-_"
    local_part = "".join([random.choice(alphabet) for _ in range(username_size)])
    return f"{local_part}@{chosen_domain}"

generate_random_phone_number(original_field)

Generate a random phone number with country code if originally present.

Parameters:

Name Type Description Default
original_field str

original phone number field.

required

Returns (str): generated random phone number.

Source code in pandasai/helpers/anonymizer.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def generate_random_phone_number(original_field: str) -> str:
    """Generate a random phone number with country code if originally present.

    Only the country code (``+`` followed by one to three digits) is carried
    over from ``original_field``. The remaining digits of the original number
    are never copied into the result, even when the original uses dashes or
    no separators at all (e.g. ``+1-555-123-4567`` or ``+15551234567``).

    Args:
        original_field (str): original phone number field.

    Returns (str): generated random phone number, formatted as
        ``"<country code> <10 digits>"`` when a country code was found and
        as ``"<10 digits>"`` otherwise.
    """
    country_code = ""
    if original_field.startswith("+"):
        # "+", one to three digits, then a non-digit (space, dash, paren) or
        # the end of the string.
        prefix = re.match(r"\+\d{1,3}(?!\d)", original_field)
        if prefix is not None:
            country_code = prefix.group()
        else:
            # No separator after the country code: treat the digits that
            # precede the last ten digits as the country code.
            leading_digits = re.sub(r"[^0-9]", "", original_field)[:-10]
            if 1 <= len(leading_digits) <= 3:
                country_code = f"+{leading_digits}"

    number = "".join(random.choices("0123456789", k=10))
    return f"{country_code} {number}" if country_code else number

is_valid_credit_card(credit_card_number)

Check if the given credit card number is valid based on regex pattern.

Parameters:

Name Type Description Default
credit_card_number str

credit card number to be checked.

required

Returns (bool): True if the credit card number is valid, otherwise False.

Source code in pandasai/helpers/anonymizer.py
42
43
44
45
46
47
48
49
50
51
52
53
def is_valid_credit_card(credit_card_number: str) -> bool:
    """Tell whether a string looks like a 16-digit credit card number.

    The accepted shape is four groups of four digits, each group optionally
    separated from the next by a dash or a space.

    Args:
        credit_card_number (str): credit card number to be checked.

    Returns (bool): True if the credit card number is valid, otherwise False.
    """
    card_regex = r"^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}$"
    found = re.search(card_regex, credit_card_number)
    if found is None:
        return False
    return True

is_valid_email(email)

Check if the given email is valid based on regex pattern.

Parameters:

Name Type Description Default
email str

email address to be checked.

required

Returns (bool): True if the email is valid, otherwise False.

Source code in pandasai/helpers/anonymizer.py
14
15
16
17
18
19
20
21
22
23
24
25
def is_valid_email(email: str) -> bool:
    """Tell whether a string looks like an email address.

    Args:
        email (str): email address to be checked.

    Returns (bool): True if the email is valid, otherwise False.
    """
    matched = re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", email)
    # Match objects are always truthy, so bool() is True exactly on a match.
    return bool(matched)

is_valid_phone_number(phone_number)

Check if the given phone number is valid based on regex pattern.

Parameters:

Name Type Description Default
phone_number str

phone number to be checked.

required

Returns (bool): True if the phone number is valid, otherwise False.

Source code in pandasai/helpers/anonymizer.py
28
29
30
31
32
33
34
35
36
37
38
39
def is_valid_phone_number(phone_number: str) -> bool:
    """Tell whether a string contains something shaped like a phone number.

    The pattern is searched anywhere in the string: an optional country code
    (optional ``+`` and one to three digits), then groups of 3, 3 and 4
    digits with optional dashes, spaces or parentheses around the first group.

    Args:
        phone_number (str): phone number to be checked.

    Returns (bool): True if the phone number is valid, otherwise False.
    """
    phone_regex = re.compile(
        r"\b(?:\+?\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b"
    )
    if phone_regex.search(phone_number):
        return True
    return False

Jupyter Notebook

Helper functions to handle Jupyter Notebook execution feature

pandasai.helpers.notebook

Helper module to handle Jupyter Notebook. This module contains helper functions to interact with Jupyter Notebook functionality.

Notebook

Baseclass to implement Notebook helper functions

Source code in pandasai/helpers/notebook.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
class Notebook:
    """Baseclass to implement Notebook helper functions"""

    def in_notebook(self) -> bool:
        """
        Checks whether the code is running inside a notebook environment.

        The check looks for the ``IPKernelApp`` entry in the configuration of
        the object returned by ``get_ipython()``.

        Returns (bool): True if the code is running inside a Jupyter notebook, False otherwise.
        """
        try:
            return "IPKernelApp" in get_ipython().config
        except (ImportError, AttributeError):
            return False

    def create_new_cell(self, contents: str) -> None:
        """
        Creates a new code cell in the Jupyter notebook and populates it with the specified
        contents.

        Args:
            contents (str): The contents to be added to the new code cell.

        Raises:
            ImportError: If the IPython module is not installed.
            AttributeError: If the 'get_ipython()' call raises an AttributeError, which can
                happen if the code is not running inside a Jupyter notebook.

        Returns: None
        """
        payload = {"source": "set_next_input", "text": contents, "replace": False}
        # Errors from the call propagate unchanged to the caller; a handler
        # that only re-raised them was redundant and has been dropped.
        get_ipython().payload_manager.write_payload(payload, single=False)

create_new_cell(contents)

Creates a new code cell in the Jupyter notebook and populates it with the specified contents.

Parameters:

Name Type Description Default
contents str

The contents to be added to the new code cell.

required
ImportError

If the IPython module is not installed.

AttributeError

If the 'get_ipython()' call raises an AttributeError, which can happen if the code is not running inside a Jupyter notebook.

Source code in pandasai/helpers/notebook.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def create_new_cell(self, contents: str) -> None:
    """
    Creates a new code cell in the Jupyter notebook and populates it with the specified
    contents.

    Args:
        contents (str): The contents to be added to the new code cell.

    Raises:
        ImportError: If the IPython module is not installed.
        AttributeError: If the 'get_ipython()' call raises an AttributeError, which can
            happen if the code is not running inside a Jupyter notebook.

    Returns: None
    """
    payload = {"source": "set_next_input", "text": contents, "replace": False}
    # Errors from the call propagate unchanged to the caller; a handler
    # that only re-raised them was redundant and has been dropped.
    get_ipython().payload_manager.write_payload(payload, single=False)

in_notebook()

Checks whether the code is running inside a notebook environment.

Returns (bool): True if the code is running inside a Jupyter notebook, False otherwise.

Source code in pandasai/helpers/notebook.py
13
14
15
16
17
18
19
20
21
22
23
24
25
def in_notebook(self) -> bool:
    """
    Reports whether the code is running inside a notebook environment.

    The check looks for the ``IPKernelApp`` entry in the configuration of
    the object returned by ``get_ipython()``.

    Returns (bool): True if the code is running inside a Jupyter notebook, False otherwise.
    """
    try:
        has_kernel_app = "IPKernelApp" in get_ipython().config
    except (ImportError, AttributeError):
        return False
    return has_kernel_app