Skip to content

Commit 111996c

Browse files
authored
Merge pull request #460 from KhiopsML/454-unconsistence-between-download_datasets-and-get_samples_dir
Make tools.download_datasets consistent with the runner
2 parents ef17036 + e07cbc8 commit 111996c

File tree

6 files changed

+191
-90
lines changed

6 files changed

+191
-90
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
- (`core`) Dictionary API support for dictionary, variable and variable block
1313
comments, and dictionary and variable block internal comments.
1414

15+
### Fixed
16+
- (General) Inconsistency between the `tools.download_datasets` function and the
17+
current samples directory according to `core.api.get_samples_dir()`.
18+
19+
1520
## 11.0.0.0-b.0 - 2025-07-10
1621

1722
### Added

khiops/core/internals/filesystems.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
import json
1010
import os
11-
import platform
1211
import shutil
1312
import warnings
1413
from abc import ABC, abstractmethod
@@ -59,8 +58,11 @@ def is_local_resource(uri_or_path):
5958
`bool`
6059
`True` if a URI refers to a local path
6160
"""
62-
uri_info = urlparse(uri_or_path, allow_fragments=False)
63-
return len(uri_info.scheme) <= 1 or uri_info.scheme == "file"
61+
if (index := uri_or_path.find("://")) > 0:
62+
scheme = uri_or_path[:index]
63+
return len(scheme) == 1 or scheme == "file"
64+
else:
65+
return True
6466

6567

6668
def create_resource(uri_or_path):
@@ -80,15 +82,34 @@ def create_resource(uri_or_path):
8082
`FilesystemResource`
8183
The URI resource object, its class depends on the URI.
8284
"""
83-
uri_info = urlparse(uri_or_path, allow_fragments=False)
84-
if uri_info.scheme == "s3":
85-
return AmazonS3Resource(uri_or_path)
86-
elif uri_info.scheme == "gs":
87-
return GoogleCloudStorageResource(uri_or_path)
88-
elif is_local_resource(uri_or_path):
89-
return LocalFilesystemResource(uri_or_path)
85+
# Case where the URI scheme separator `://` is contained in the uri/path
86+
if (index := uri_or_path.find("://")) > 0:
87+
scheme = uri_or_path[:index]
88+
89+
# Case of normal schemes (those whose scheme is not a single char)
90+
# Note: Any 1-char scheme is considered a Windows path
91+
if len(scheme) > 1:
92+
uri_info = urlparse(uri_or_path, allow_fragments=False)
93+
if uri_info.scheme == "s3":
94+
return AmazonS3Resource(uri_or_path)
95+
elif uri_info.scheme == "gs":
96+
return GoogleCloudStorageResource(uri_or_path)
97+
elif scheme == "file":
98+
# Reject URI if authority is not empty
99+
if uri_info.netloc:
100+
raise ValueError(
101+
f"Non-empty 'authority' in local-path URI '{uri_or_path}': "
102+
f"'{uri_info.netloc}'"
103+
)
104+
return LocalFilesystemResource(uri_or_path)
105+
else:
106+
raise ValueError(f"Unsupported URI scheme '{uri_info.scheme}'")
107+
else:
108+
return LocalFilesystemResource(uri_or_path)
109+
110+
# No scheme separator `://` found: Build a local resource
90111
else:
91-
raise ValueError(f"Unsupported URI scheme {uri_info.scheme}")
112+
return LocalFilesystemResource(uri_or_path)
92113

93114

94115
def parent_path(path):
@@ -411,12 +432,8 @@ def __init__(self, uri):
411432
# Obtain the local from the URI
412433
# Case where the scheme is in fact a windows drive
413434
# => Build the proper path with drive
414-
if (
415-
len(self.uri_info.scheme) == 1
416-
and self.uri_info.scheme.isalpha()
417-
and platform.system() == "Windows"
418-
):
419-
self.path = os.path.join(f"{self.uri_info.scheme}:\\", self.uri_info.path)
435+
if len(self.uri_info.scheme) == 1 and self.uri_info.scheme.isalpha():
436+
self.path = f"{self.uri_info.scheme}:{self.uri_info.path}"
420437
# Case of the "file" scheme
421438
elif self.uri_info.scheme == "file":
422439
# If invalid second colon in path (eg. "/C:/Users"):

khiops/core/internals/runner.py

Lines changed: 29 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import io
1616
import os
17+
import pathlib
1718
import platform
1819
import shlex
1920
import shutil
@@ -49,21 +50,36 @@ def _isdir_without_all_perms(dir_path):
4950
)
5051

5152

53+
def get_default_samples_dir():
54+
"""Returns the default samples directory
55+
56+
The default samples directory is computed according to the following priorities:
57+
- all systems: ``KHIOPS_SAMPLES_DIR/khiops_data/samples`` if set
58+
- Windows:
59+
- ``%PUBLIC%\\khiops_data\\samples`` if ``%PUBLIC%`` is defined
60+
- ``%USERPROFILE%\\khiops_data\\samples`` otherwise
61+
- Linux/macOS: ``$HOME/khiops_data/samples``
62+
"""
63+
if "KHIOPS_SAMPLES_DIR" in os.environ and os.environ["KHIOPS_SAMPLES_DIR"]:
64+
samples_dir = os.environ["KHIOPS_SAMPLES_DIR"]
65+
elif platform.system() == "Windows" and "PUBLIC" in os.environ:
66+
samples_dir = os.path.join(os.environ["PUBLIC"], "khiops_data", "samples")
67+
else:
68+
samples_dir = str(pathlib.Path.home() / "khiops_data" / "samples")
69+
return samples_dir
70+
71+
5272
def _get_dir_status(a_dir):
5373
"""Returns the status of a local or remote directory
5474
5575
Against a local directory a real check is performed. A remote directory is detected
5676
but not checked.
5777
"""
5878
if fs.is_local_resource(a_dir):
59-
# Remove initial slash on windows systems
60-
# urllib's url2pathname does not work properly
6179
a_dir_res = fs.create_resource(os.path.normpath(a_dir))
62-
a_dir_path = a_dir_res.uri_info.path
63-
if platform.system() == "Windows":
64-
if a_dir_path.startswith("/"):
65-
a_dir_path = a_dir_path[1:]
6680

81+
# a_dir_res is a LocalFilesystemResource already
82+
a_dir_path = a_dir_res.path
6783
if not os.path.exists(a_dir_path):
6884
status = "non-existent"
6985
elif not os.path.isdir(a_dir_path):
@@ -98,31 +114,6 @@ def _check_samples_dir(samples_dir):
98114
)
99115

100116

101-
def _extract_path_from_uri(uri):
102-
res = fs.create_resource(uri)
103-
if platform.system() == "Windows":
104-
# Case of file:///<LETTER>:/<REST_OF_PATH>:
105-
# Eliminate first slash ("/") from path if the first component
106-
if (
107-
res.uri_info.scheme == ""
108-
and res.uri_info.path[0] == "/"
109-
and res.uri_info.path[1].isalpha()
110-
and res.uri_info.path[2] == ":"
111-
):
112-
path = res.uri_info.path[1:]
113-
# Case of C:/<REST_OF_PATH>:
114-
# Just use the original path
115-
elif len(res.uri_info.scheme) == 1:
116-
path = uri
117-
# Otherwise return URI path as-is
118-
else:
119-
path = res.uri_info.path
120-
121-
else:
122-
path = res.uri_info.path
123-
return path
124-
125-
126117
def _khiops_env_file_exists(env_dir):
127118
"""Check ``khiops_env`` exists relative to the specified environment dir"""
128119
khiops_env_path = os.path.join(env_dir, "khiops_env")
@@ -399,7 +390,7 @@ def root_temp_dir(self):
399390
def root_temp_dir(self, dir_path):
400391
# Check existence, directory status and permissions for local paths
401392
if fs.is_local_resource(dir_path):
402-
real_dir_path = _extract_path_from_uri(dir_path)
393+
real_dir_path = fs.create_resource(dir_path).path
403394
if os.path.exists(real_dir_path):
404395
if os.path.isfile(real_dir_path):
405396
raise KhiopsEnvironmentError(
@@ -439,7 +430,7 @@ def create_temp_file(self, prefix, suffix):
439430
# Local resource: Effectively create the file with the python file API
440431
if fs.is_local_resource(self.root_temp_dir):
441432
# Extract the path from the potential URI
442-
root_temp_dir_path = _extract_path_from_uri(self.root_temp_dir)
433+
root_temp_dir_path = fs.create_resource(self.root_temp_dir).path
443434

444435
# Create the temporary file
445436
tmp_file_fd, tmp_file_path = tempfile.mkstemp(
@@ -470,7 +461,7 @@ def create_temp_dir(self, prefix):
470461
"""
471462
# Local resource: Effectively create the directory with the python file API
472463
if fs.is_local_resource(self.root_temp_dir):
473-
root_temp_dir_path = _extract_path_from_uri(self.root_temp_dir)
464+
root_temp_dir_path = fs.create_resource(self.root_temp_dir).path
474465
temp_dir = tempfile.mkdtemp(prefix=prefix, dir=root_temp_dir_path)
475466
# Remote resource: Just return a highly probable unique path
476467
else:
@@ -919,7 +910,7 @@ class KhiopsLocalRunner(KhiopsRunner):
919910
920911
- Windows:
921912
922-
- ``%PUBLIC%\khiops_data\samples%`` if it exists and is a directory
913+
- ``%PUBLIC%\khiops_data\samples%`` if ``%PUBLIC%`` is defined
923914
- ``%USERPROFILE%\khiops_data\samples%`` otherwise
924915
925916
- Linux and macOS:
@@ -1029,38 +1020,9 @@ def _initialize_khiops_environment(self):
10291020

10301021
def _initialize_default_samples_dir(self):
10311022
"""See class docstring"""
1032-
# Set the fallback value for the samples directory
1033-
home_samples_dir = Path.home() / "khiops_data" / "samples"
1034-
1035-
# Take the value of an environment variable in priority
1036-
if "KHIOPS_SAMPLES_DIR" in os.environ:
1037-
self._samples_dir = os.environ["KHIOPS_SAMPLES_DIR"]
1038-
1039-
# The samples location of Windows systems is:
1040-
# - %PUBLIC%\khiops_data\samples if %PUBLIC% exists
1041-
# - %USERPROFILE%\khiops_data\samples otherwise
1042-
elif platform.system() == "Windows":
1043-
if "PUBLIC" in os.environ:
1044-
public_samples_dir = os.path.join(
1045-
os.environ["PUBLIC"], "khiops_data", "samples"
1046-
)
1047-
else:
1048-
public_samples_dir = None
1049-
1050-
ok_statuses = ["ok", "remote"]
1051-
if (
1052-
public_samples_dir is not None
1053-
and _get_dir_status(public_samples_dir) in ok_statuses
1054-
):
1055-
self._samples_dir = public_samples_dir
1056-
else:
1057-
self._samples_dir = str(home_samples_dir)
1058-
1059-
# The default samples location on Unix systems is:
1060-
# $HOME/khiops/samples on Linux and Mac OS
1061-
else:
1062-
self._samples_dir = str(home_samples_dir)
1063-
1023+
samples_dir = get_default_samples_dir()
1024+
_check_samples_dir(samples_dir)
1025+
self._samples_dir = samples_dir
10641026
assert self._samples_dir is not None
10651027

10661028
def _check_tools(self):

khiops/tools.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import zipfile
2222

2323
import khiops.core as kh
24+
from khiops.core.internals.runner import get_default_samples_dir
2425
from khiops.samples import samples as samples_core
2526

2627
# We deactivate the warnings to not show a deprecation warning from sklearn
@@ -115,7 +116,11 @@ def download_datasets(
115116
"""Downloads the Khiops sample datasets for a given version
116117
117118
The datasets are downloaded to:
118-
- Windows: ``%USERPROFILE%\\khiops_data\\samples``
119+
- all systems: ``KHIOPS_SAMPLES_DIR/khiops_data/samples`` if
120+
``KHIOPS_SAMPLES_DIR`` is defined and non-empty
121+
- Windows:
122+
- ``%PUBLIC%\\khiops_data\\samples`` if ``%PUBLIC%`` is defined
123+
- ``%USERPROFILE%\\khiops_data\\samples`` otherwise
119124
- Linux/macOS: ``$HOME/khiops_data/samples``
120125
121126
Parameters
@@ -126,10 +131,8 @@ def download_datasets(
126131
The version of the samples datasets.
127132
"""
128133
# Note: The hidden parameter _called_from_shell is just to change the user messages.
129-
130-
# Check if the home sample dataset location is available and build it if necessary
131-
samples_dir = pathlib.Path.home() / "khiops_data" / "samples"
132-
if samples_dir.exists() and not force_overwrite:
134+
samples_dir = get_default_samples_dir()
135+
if os.path.exists(samples_dir) and not force_overwrite:
133136
if _called_from_shell:
134137
instructions = "Execute with '--force-overwrite' to overwrite it"
135138
else:
@@ -140,7 +143,7 @@ def download_datasets(
140143
)
141144
else:
142145
# Create the samples dataset directory
143-
if samples_dir.exists():
146+
if os.path.exists(samples_dir):
144147
shutil.rmtree(samples_dir)
145148
os.makedirs(samples_dir, exist_ok=True)
146149

tests/test_dataset_class.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or #
55
# see the "LICENSE.md" file for more details. #
66
######################################################################################
7+
"""Test the expected behavior of the Dataset class"""
78
import os
89
import shutil
910
import unittest

0 commit comments

Comments
 (0)