Commit bfdd492

Merge pull request #2652 from Avaiga/feature/2603-separator-in-csv-dn
Feature/#2603 - Add separator property to CSVDataNode
1 parent 260af4c commit bfdd492

10 files changed: +116 -20 lines

taipy/common/config/config.pyi

Lines changed: 2 additions & 0 deletions

@@ -424,6 +424,7 @@ class Config:
     default_path: Optional[str] = None,
     encoding: Optional[str] = None,
     has_header: Optional[bool] = None,
+    separator: Optional[str] = None,
     exposed_type: Optional[str] = None,
     scope: Optional[Scope] = None,
     validity_period: Optional[timedelta] = None,
@@ -436,6 +437,7 @@ class Config:
     default_path (Optional[str]): The default path of the CSV file.
     encoding (Optional[str]): The encoding of the CSV file.
     has_header (Optional[bool]): If True, indicates that the CSV file has a header.
+    separator (Optional[str]): The character used to separate values in the CSV file.
     exposed_type (Optional[str]): The exposed type of the data read from CSV file.<br/>
         The default value is `pandas`.
     scope (Optional[Scope^]): The scope of the CSV data node configuration.<br/>

taipy/core/config/data_node_config.py

Lines changed: 7 additions & 0 deletions

@@ -97,6 +97,7 @@ class DataNodeConfig(Section):
 _OPTIONAL_EXPOSED_TYPE_CSV_PROPERTY = "exposed_type"
 _OPTIONAL_DEFAULT_PATH_CSV_PROPERTY = "default_path"
 _OPTIONAL_HAS_HEADER_CSV_PROPERTY = "has_header"
+_OPTIONAL_SEPARATOR_CSV_PROPERTY = "separator"
 # Excel
 _OPTIONAL_EXPOSED_TYPE_EXCEL_PROPERTY = "exposed_type"
 _OPTIONAL_DEFAULT_PATH_EXCEL_PROPERTY = "default_path"
@@ -198,6 +199,7 @@ class DataNodeConfig(Section):
     _OPTIONAL_DEFAULT_PATH_CSV_PROPERTY: str,
     _OPTIONAL_ENCODING_PROPERTY: str,
     _OPTIONAL_HAS_HEADER_CSV_PROPERTY: bool,
+    _OPTIONAL_SEPARATOR_CSV_PROPERTY: str,
     _OPTIONAL_EXPOSED_TYPE_CSV_PROPERTY: (str, Callable),
 },
 _STORAGE_TYPE_VALUE_EXCEL: {
@@ -290,6 +292,7 @@ class DataNodeConfig(Section):
     _OPTIONAL_DEFAULT_PATH_CSV_PROPERTY: None,
     _OPTIONAL_ENCODING_PROPERTY: _DEFAULT_ENCODING_VALUE,
     _OPTIONAL_HAS_HEADER_CSV_PROPERTY: True,
+    _OPTIONAL_SEPARATOR_CSV_PROPERTY: ",",
     _OPTIONAL_EXPOSED_TYPE_CSV_PROPERTY: _DEFAULT_EXPOSED_TYPE,
 },
 _STORAGE_TYPE_VALUE_EXCEL: {
@@ -622,6 +625,7 @@ def _configure_csv(
     default_path: Optional[str] = None,
     encoding: Optional[str] = None,
     has_header: Optional[bool] = None,
+    separator: Optional[str] = None,
     exposed_type: Optional[str] = None,
     scope: Optional[Scope] = None,
     validity_period: Optional[timedelta] = None,
@@ -634,6 +638,7 @@ def _configure_csv(
     default_path (Optional[str]): The default path of the CSV file.
     encoding (Optional[str]): The encoding of the CSV file.
     has_header (Optional[bool]): If True, indicates that the CSV file has a header.
+    separator (Optional[str]): The character used to separate values in the CSV file.
     exposed_type (Optional[str]): The exposed type of the data read from CSV file.<br/>
         The default value is `pandas`.
     scope (Optional[Scope^]): The scope of the CSV data node configuration.<br/>
@@ -655,6 +660,8 @@ def _configure_csv(
     properties[cls._OPTIONAL_ENCODING_PROPERTY] = encoding
 if has_header is not None:
     properties[cls._OPTIONAL_HAS_HEADER_CSV_PROPERTY] = has_header
+if separator is not None:
+    properties[cls._OPTIONAL_SEPARATOR_CSV_PROPERTY] = separator
 if exposed_type is not None:
     properties[cls._OPTIONAL_EXPOSED_TYPE_CSV_PROPERTY] = exposed_type

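Taken together, the stub and the config implementation expose `separator` as an optional CSV property that defaults to ",". A minimal usage sketch, modeled on the call shape used in the tests further down; the config id and path are illustrative, and the top-level `Config` import is assumed:

from taipy import Config

# Illustrative id/path; separator falls back to "," when omitted.
sales_history_cfg = Config.configure_csv_data_node(
    id="sales_history",
    default_path="path/sales.csv",
    encoding="utf-8",
    has_header=True,
    separator=";",
)
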
taipy/core/data/csv.py

Lines changed: 39 additions & 13 deletions

@@ -36,11 +36,15 @@ class CSVDataNode(DataNode, _FileDataNodeMixin, _TabularDataNodeMixin):
 - *default_data*: The default data of the data node. It is used at the data node instantiation
   to write the data to the CSV file.
 - *has_header* (`bool`): If True, indicates that the CSV file has a header.
+- *separator* (`str`): The separator used in the CSV file. The default value is `,`.
 - *exposed_type*: The exposed type of the data read from CSV file. The default value is `pandas`.
 """

 __STORAGE_TYPE = "csv"
-__ENCODING_KEY = "encoding"
+_ENCODING_KEY = "encoding"
+_DEFAULT_ENCODING_VALUE = "utf-8"
+_SEPARATOR_KEY = "separator"
+_DEFAULT_SEPARATOR_VALUE = ","

 _REQUIRED_PROPERTIES: List[str] = []

@@ -65,12 +69,15 @@ def __init__(
 if properties is None:
     properties = {}

-if self.__ENCODING_KEY not in properties.keys():
-    properties[self.__ENCODING_KEY] = "utf-8"
+if self._ENCODING_KEY not in properties.keys():
+    properties[self._ENCODING_KEY] = self._DEFAULT_ENCODING_VALUE

 if self._HAS_HEADER_PROPERTY not in properties.keys():
     properties[self._HAS_HEADER_PROPERTY] = True

+if self._SEPARATOR_KEY not in properties.keys():
+    properties[self._SEPARATOR_KEY] = self._DEFAULT_SEPARATOR_VALUE
+
 properties[self._EXPOSED_TYPE_PROPERTY] = _TabularDataNodeMixin._get_valid_exposed_type(properties)
 self._check_exposed_type(properties[self._EXPOSED_TYPE_PROPERTY])

@@ -106,7 +113,8 @@ def __init__(
     self._IS_GENERATED_KEY,
     self._HAS_HEADER_PROPERTY,
     self._EXPOSED_TYPE_PROPERTY,
-    self.__ENCODING_KEY,
+    self._ENCODING_KEY,
+    self._SEPARATOR_KEY,
 }
 )

@@ -141,12 +149,12 @@ def _read_from_path(self, path: Optional[str] = None, **read_kwargs) -> Any:

 def _read_as(self, path: str):
     properties = self.properties
-    with open(path, encoding=properties[self.__ENCODING_KEY]) as csvFile:
+    with open(path, encoding=properties[self._ENCODING_KEY]) as csvFile:
         if properties[self._HAS_HEADER_PROPERTY]:
-            reader_with_header = csv.DictReader(csvFile)
+            reader_with_header = csv.DictReader(csvFile, delimiter=properties[self._SEPARATOR_KEY])
             return [self._decoder(line) for line in reader_with_header]

-        reader_without_header = csv.reader(csvFile)
+        reader_without_header = csv.reader(csvFile, delimiter=properties[self._SEPARATOR_KEY])
         return [self._decoder(line) for line in reader_without_header]

 def _read_as_numpy(self, path: str) -> np.ndarray:
@@ -162,20 +170,37 @@ def _read_as_pandas_dataframe(
 properties = self.properties
 if properties[self._HAS_HEADER_PROPERTY]:
     if column_names:
-        return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY])[column_names]
-    return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY])
+        return pd.read_csv(
+            path, encoding=properties[self._ENCODING_KEY], sep=properties[self._SEPARATOR_KEY]
+        )[column_names]
+    return pd.read_csv(path, encoding=properties[self._ENCODING_KEY], sep=properties[self._SEPARATOR_KEY])
 else:
     if usecols:
-        return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY], header=None, usecols=usecols)
-    return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY], header=None)
+        return pd.read_csv(
+            path,
+            encoding=properties[self._ENCODING_KEY],
+            sep=properties[self._SEPARATOR_KEY],
+            header=None,
+            usecols=usecols,
+        )
+    return pd.read_csv(
+        path, encoding=properties[self._ENCODING_KEY], header=None, sep=properties[self._SEPARATOR_KEY]
+    )
 except pd.errors.EmptyDataError:
     return pd.DataFrame()

 def _append(self, data: Any):
     properties = self.properties
     exposed_type = properties[self._EXPOSED_TYPE_PROPERTY]
     data = self._convert_data_to_dataframe(exposed_type, data)
-    data.to_csv(self._path, mode="a", index=False, encoding=properties[self.__ENCODING_KEY], header=False)
+    data.to_csv(
+        self._path,
+        mode="a",
+        index=False,
+        encoding=properties[self._ENCODING_KEY],
+        sep=properties[self._SEPARATOR_KEY],
+        header=False,
+    )

 def _write(self, data: Any, columns: Optional[List[str]] = None):
     self._write_to_path(self._path, data, columns)
@@ -191,6 +216,7 @@ def _write_to_path(self, path: str, data: Any, columns: Optional[List[str]] = No
 data.to_csv(
     path,
     index=False,
-    encoding=properties[self.__ENCODING_KEY],
+    encoding=properties[self._ENCODING_KEY],
+    sep=properties[self._SEPARATOR_KEY],
     header=properties[self._HAS_HEADER_PROPERTY],
 )

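As the hunks above show, the data node stores the separator as a plain property and forwards it to `csv.DictReader`/`csv.reader` as `delimiter` and to `pandas.read_csv`/`DataFrame.to_csv` as `sep`. A minimal sketch of exercising it directly, following the constructor usage in the new tests; the path is illustrative and the import locations are assumed from this repository layout and Taipy's top-level exports:

from taipy import Scope
from taipy.core.data.csv import CSVDataNode

# Illustrative ";"-separated file with a header row.
dn = CSVDataNode(
    "semicolon_dn",
    Scope.SCENARIO,
    properties={"path": "data_sample/example_2.csv", "has_header": True, "separator": ";"},
)

df = dn.read()  # exposed_type defaults to "pandas", so this goes through pd.read_csv(..., sep=";")
print(dn.properties["separator"])  # ";" here; falls back to "," when the property is omitted
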
tests/core/config/test_config_serialization.py

Lines changed: 3 additions & 1 deletion

@@ -135,6 +135,7 @@ def test_read_write_toml_configuration_file():
 exposed_type = "tests.core.config.test_config_serialization.CustomClass:class"
 encoding = "utf-8"
 has_header = "True:bool"
+separator = ","

 [DATA_NODE.test_json_dn]
 storage_type = "json"
@@ -305,7 +306,8 @@ def test_read_write_json_configuration_file():
 "path": "./test.csv",
 "exposed_type": "tests.core.config.test_config_serialization.CustomClass:class",
 "encoding": "utf-8",
-"has_header": "True:bool"
+"has_header": "True:bool",
+"separator": ","
 },
 "test_json_dn": {
 "storage_type": "json",

tests/core/config/test_configure_default_config.py

Lines changed: 2 additions & 2 deletions

@@ -66,7 +66,7 @@ def test_set_default_data_node_configuration_replace_old_default_config():
 )
 dn2 = Config.configure_data_node(id="dn2")
 assert dn2.storage_type == "csv"
-assert len(dn2.properties) == 6 # encoding, exposed_type, and has_header too
+assert len(dn2.properties) == 7 # encoding, separator, exposed_type, and has_header
 assert dn2.prop4 == "4"
 assert dn2.prop5 == "5"
 assert dn2.prop6 == "6"
@@ -85,7 +85,7 @@ def test_config_storage_type_different_from_default_data_node():
 # Config a datanode with specific "storage_type" different than "pickle"
 # should ignore the default datanode
 csv_dn = Config.configure_data_node(id="csv_dn", storage_type="csv")
-assert len(csv_dn.properties) == 3 # encoding, exposed_type, and has_header
+assert len(csv_dn.properties) == 4 # encoding, separator, exposed_type, and has_header
 assert csv_dn.properties.get("custom_property") is None
 assert csv_dn.scope == Scope.SCENARIO

tests/core/data/data_sample/example_2.csv (new file)

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+id;integer;text
+Ibelfu5;584;This is the first row
+h89653fu5;7;This is the second row
+hdds23;275;This is the third row
+q68423;754;This is the fourth row
+qqf8;10;This is the fifth row
+5sqf8;11778;This is the sixth row
+5458;95;This is the seventh row
+569ggg8;466;This is the 8th row
+kus458;635;This is the 9th row
+5kuds458;9;This is the last row
+jEn4a;1001;1st appended line
+4ajeQ;1002;2nd appended line

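For reference, this semicolon-separated fixture is what the new read and append tests below load; it can be sanity-checked with plain pandas (the relative path assumes the tests/core/data working directory):

import pandas as pd

# The same parse the read test relies on: pandas with sep=";".
df = pd.read_csv("data_sample/example_2.csv", sep=";")
assert list(df.columns) == ["id", "integer", "text"]
assert len(df) == 12  # 10 original rows plus the 2 appended lines committed with the fixture
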
tests/core/data/test_csv_data_node.py

Lines changed: 3 additions & 1 deletion

@@ -54,7 +54,7 @@ class TestCSVDataNode:
 def test_create(self):
     default_path = "data/node/path"
     csv_dn_config = Config.configure_csv_data_node(
-        id="foo_bar", default_path=default_path, has_header=False, name="super name"
+        id="foo_bar", default_path=default_path, has_header=False, name="super name", separator=";"
     )
     dn = _DataManagerFactory._build_manager()._create(csv_dn_config, None, None)
     assert isinstance(dn, CSVDataNode)
@@ -70,6 +70,7 @@ def test_create(self):
 assert dn.path == default_path
 assert dn.properties["has_header"] is False
 assert dn.properties["exposed_type"] == "pandas"
+assert dn.properties["separator"] == ";"

 csv_dn_config = Config.configure_csv_data_node(
     id="foo", default_path=default_path, has_header=True, exposed_type=MyCustomObject
@@ -79,6 +80,7 @@ def test_create(self):
 assert dn.config_id == "foo"
 assert dn.properties["has_header"] is True
 assert dn.properties["exposed_type"] == MyCustomObject
+assert dn.properties["separator"] == ","

 with pytest.raises(InvalidConfigurationId):
     CSVDataNode(

tests/core/data/test_data_manager.py

Lines changed: 4 additions & 3 deletions

@@ -109,10 +109,11 @@ def test_create_and_get_csv_data_node(self):
 assert not _DataManager._get(csv_dn.id).is_ready_for_reading
 assert _DataManager._get(csv_dn.id).is_ready_for_reading == csv_dn.is_ready_for_reading
 assert (
-    len(_DataManager._get(csv_dn.id).properties) == 5
-) # path, encoding, has_header, exposed_type, is_generated
+    len(_DataManager._get(csv_dn.id).properties) == 6
+) # path, encoding, separator, has_header, exposed_type, is_generated
 assert _DataManager._get(csv_dn.id).properties.get("path") == "bar"
 assert _DataManager._get(csv_dn.id).properties.get("encoding") == "utf-8"
+assert _DataManager._get(csv_dn.id).properties.get("separator") == ","
 assert _DataManager._get(csv_dn.id).properties.get("has_header") is True
 assert _DataManager._get(csv_dn.id).properties.get("exposed_type") == "pandas"
 assert _DataManager._get(csv_dn.id).properties.get("is_generated") is False
@@ -137,7 +138,7 @@ def test_create_and_get_csv_data_node(self):
 assert _DataManager._get(csv_dn).job_ids == csv_dn.job_ids
 assert not _DataManager._get(csv_dn).is_ready_for_reading
 assert _DataManager._get(csv_dn).is_ready_for_reading == csv_dn.is_ready_for_reading
-assert len(_DataManager._get(csv_dn).properties) == 5 # path, encoding, has_header, exposed_type, is_generated
+assert len(_DataManager._get(csv_dn).properties) == 6
 assert _DataManager._get(csv_dn).properties.get("path") == "bar"
 assert _DataManager._get(csv_dn).properties.get("encoding") == "utf-8"
 assert _DataManager._get(csv_dn).properties.get("has_header") is True

tests/core/data/test_read_csv_data_node.py

Lines changed: 9 additions & 0 deletions

@@ -23,6 +23,7 @@
 from taipy.core.exceptions.exceptions import NoData

 csv_file_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")
+csv_2_file_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example_2.csv")


 @dataclasses.dataclass
@@ -117,3 +118,11 @@ def test_read_without_header_custom_exposed_type():
 assert row_pandas[0] == row_custom.id
 assert str(row_pandas[1]) == row_custom.integer
 assert row_pandas[2] == row_custom.text
+
+
+def test_read_with_different_separator():
+    csv_data_node_as_pandas = CSVDataNode(
+        "bar", Scope.SCENARIO, properties={"path": csv_2_file_path, "has_header": True, "separator": ";"}
+    )
+    data_pandas = csv_data_node_as_pandas.read()
+    assert pd.DataFrame.equals(data_pandas, pd.read_csv(csv_2_file_path, sep=";"))

tests/core/data/test_write_csv_data_node.py

Lines changed: 34 additions & 0 deletions

@@ -71,6 +71,29 @@ def test_append(csv_file, default_data_frame, content):
 )


+def test_append_with_different_separator():
+    path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example_2.csv")
+
+    original_content = pd.read_csv(path, sep=";")
+    content = pd.DataFrame(
+        [
+            {"id": "jEn4a", "integer": 1001, "text": "1st appended line"},
+            {"id": "4ajeQ", "integer": 1002, "text": "2nd appended line"},
+        ],
+    )
+
+    csv_dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": path, "separator": ";"})
+    _DataManagerFactory._build_manager()._repository._save(csv_dn)
+
+    csv_dn.append(content)
+    assert_frame_equal(
+        csv_dn.read(),
+        pd.concat([original_content, pd.DataFrame(content, columns=["id", "integer", "text"])]).reset_index(drop=True),
+    )
+    # Reset the file to its original content
+    csv_dn.write(original_content)
+
+
 def test_write_with_header_pandas(tmp_csv_file):
     csv_dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": tmp_csv_file})
     _DataManagerFactory._build_manager()._repository._save(csv_dn)
@@ -199,3 +222,14 @@ def test_write_with_column_names(tmp_csv_file):
 csv_dn.write_with_column_names(data, columns)
 df = pd.DataFrame(data, columns=columns)
 assert pd.DataFrame.equals(df, csv_dn.read())
+
+
+def test_write_with_different_separator(tmp_csv_file):
+    data = [[11, 22, 33], [44, 55, 66]]
+    columns = ["e", "f", "g"]
+
+    csv_dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": tmp_csv_file, "separator": ";"})
+    _DataManagerFactory._build_manager()._repository._save(csv_dn)
+    csv_dn.write_with_column_names(data, columns)
+    df = pd.DataFrame(data, columns=columns)
+    assert pd.DataFrame.equals(df, csv_dn.read())
