atoti.CsvLoad

final class atoti.CsvLoad

The description of a CSV file load.

Example

>>> import csv
>>> with open(file_path, "w") as csv_file:
...     writer = csv.writer(csv_file)
...     writer.writerows(
...         [
...             ("city", "area", "country", "population"),
...             ("Tokyo", "Kantō", "Japan", 14_094_034),
...             ("Johannesburg", "Gauteng", "South Africa", 4_803_262),
...             (
...                 "Madrid",
...                 "Community of Madrid",
...                 "Spain",
...                 3_223_334,
...             ),
...         ]
...     )

Using columns to drop the population column and rename and reorder the remaining ones:

>>> csv_load = tt.CsvLoad(
...     file_path,
...     columns={"city": "City", "area": "Region", "country": "Country"},
... )
>>> session.tables.infer_data_types(csv_load)
{'City': 'String', 'Region': 'String', 'Country': 'String'}

Creating a table and loading data into it from a headerless CSV file:

>>> with open(file_path, "w") as csv_file:
...     writer = csv.writer(csv_file)
...     writer.writerows(
...         [
...             ("Tokyo", "Kantō", "Japan", 14_094_034),
...             ("Johannesburg", "Gauteng", "South Africa", 4_803_262),
...             (
...                 "Madrid",
...                 "Community of Madrid",
...                 "Spain",
...                 3_223_334,
...             ),
...         ]
...     )
>>> csv_load = tt.CsvLoad(
...     file_path,
...     columns=["City", "Area", "Country", "Population"],
... )
>>> data_types = session.tables.infer_data_types(csv_load)
>>> data_types
{'City': 'String', 'Area': 'String', 'Country': 'String', 'Population': 'int'}
>>> table = session.create_table(
...     "Columns example",
...     data_types=data_types,
...     keys={"Country"},
... )
>>> table.load(csv_load)
>>> table.head().sort_index()
                      City                 Area  Population
Country
Japan                Tokyo                Kantō    14094034
South Africa  Johannesburg              Gauteng     4803262
Spain               Madrid  Community of Madrid     3223334

By default, true_values and false_values only parse "True" and "true" as True, and "False" and "false" as False:

>>> with open(file_path, "w") as csv_file:
...     writer = csv.writer(csv_file)
...     writer.writerows(
...         [
...             (
...                 "ID",
...                 "No & Yes",
...                 "no & yes (lower case)",
...                 "False & True",
...                 "false & true (lower case)",
...                 "0 & 1",
...             ),
...             ("abc", "No", "no", "False", "false", 0),
...             ("def", "Yes", "yes", "True", "true", 1),
...             ("ghi", "", "", "", "", ""),
...         ]
...     )
>>> csv_load = tt.CsvLoad(file_path)
>>> data_types = session.tables.infer_data_types(csv_load)
>>> data_types
{'ID': 'String', 'No & Yes': 'String', 'no & yes (lower case)': 'String', 'False & True': 'boolean', 'false & true (lower case)': 'boolean', '0 & 1': 'int'}
>>> table = session.create_table(
...     "Default true_values and false_values example",
...     data_types=data_types,
...     keys={"ID"},
... )
>>> table.load(csv_load)

Missing values in “boolean” columns become False as shown in atoti.Column.default_value:

>>> table.head().sort_index()
    No & Yes no & yes (lower case)  False & True  false & true (lower case)  0 & 1
ID
abc       No                    no         False                      False      0
def      Yes                   yes          True                       True      1
ghi      N/A                   N/A         False                      False   <NA>

Extra values can be specified in addition to the case-insensitive "true" and "false":

>>> csv_load = tt.CsvLoad(
...     file_path,
...     false_values={"No", "0"},
...     true_values={"Yes", "1"},
... )
>>> data_types = session.tables.infer_data_types(csv_load)
>>> data_types
{'ID': 'String', 'No & Yes': 'boolean', 'no & yes (lower case)': 'String', 'False & True': 'boolean', 'false & true (lower case)': 'boolean', '0 & 1': 'boolean'}
>>> table = session.create_table(
...     "Custom true_values and false_values example",
...     data_types=data_types,
...     keys={"ID"},
... )
>>> table.load(csv_load)
>>> result = table.head().sort_index()
>>> result
     No & Yes no & yes (lower case)  False & True  false & true (lower case)  0 & 1
ID
abc     False                    no         False                      False  False
def      True                   yes          True                       True   True
ghi     False                   N/A         False                      False  False