Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
- (`core`) Dictionary API support for dictionary, variable and variable block
comments, and dictionary and variable block internal comments.

### Fixed
- (General) Inconsistency between the `tools.download_datasets` function and the
current samples directory according to `core.api.get_samples_dir()`.


## 11.0.0.0-b.0 - 2025-07-10

### Added
Expand Down
51 changes: 34 additions & 17 deletions khiops/core/internals/filesystems.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import json
import os
import platform
import shutil
import warnings
from abc import ABC, abstractmethod
Expand Down Expand Up @@ -59,8 +58,11 @@ def is_local_resource(uri_or_path):
`bool`
`True` if a URI refers to a local path
"""
uri_info = urlparse(uri_or_path, allow_fragments=False)
return len(uri_info.scheme) <= 1 or uri_info.scheme == "file"
if (index := uri_or_path.find("://")) > 0:
scheme = uri_or_path[:index]
return len(scheme) == 1 or scheme == "file"
else:
return True


def create_resource(uri_or_path):
Expand All @@ -80,15 +82,34 @@ def create_resource(uri_or_path):
`FilesystemResource`
The URI resource object, its class depends on the URI.
"""
uri_info = urlparse(uri_or_path, allow_fragments=False)
if uri_info.scheme == "s3":
return AmazonS3Resource(uri_or_path)
elif uri_info.scheme == "gs":
return GoogleCloudStorageResource(uri_or_path)
elif is_local_resource(uri_or_path):
return LocalFilesystemResource(uri_or_path)
# Case where the URI scheme separator `://` is contained in the uri/path
if (index := uri_or_path.find("://")) > 0:
scheme = uri_or_path[:index]

# Case of normal schemes (those whose scheme is not a single char)
# Note: Any 1-char scheme is considered a Windows path
if len(scheme) > 1:
uri_info = urlparse(uri_or_path, allow_fragments=False)
if uri_info.scheme == "s3":
return AmazonS3Resource(uri_or_path)
elif uri_info.scheme == "gs":
return GoogleCloudStorageResource(uri_or_path)
elif scheme == "file":
# Reject URI if authority is not empty
if uri_info.netloc:
raise ValueError(
f"Non-empty 'authority' in local-path URI '{uri_or_path}': "
f"'{uri_info.netloc}'"
)
return LocalFilesystemResource(uri_or_path)
else:
raise ValueError(f"Unsupported URI scheme '{uri_info.scheme}'")
else:
return LocalFilesystemResource(uri_or_path)

# No scheme separator `://` found: Build a local resource
else:
raise ValueError(f"Unsupported URI scheme {uri_info.scheme}")
return LocalFilesystemResource(uri_or_path)


def parent_path(path):
Expand Down Expand Up @@ -411,12 +432,8 @@ def __init__(self, uri):
# Obtain the local from the URI
# Case where the scheme is in fact a windows drive
# => Build the proper path with drive
if (
len(self.uri_info.scheme) == 1
and self.uri_info.scheme.isalpha()
and platform.system() == "Windows"
):
self.path = os.path.join(f"{self.uri_info.scheme}:\\", self.uri_info.path)
if len(self.uri_info.scheme) == 1 and self.uri_info.scheme.isalpha():
self.path = f"{self.uri_info.scheme}:{self.uri_info.path}"
# Case of the "file" scheme
elif self.uri_info.scheme == "file":
# If invalid second colon in path (eg. "/C:/Users"):
Expand Down
96 changes: 29 additions & 67 deletions khiops/core/internals/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import io
import os
import pathlib
import platform
import shlex
import shutil
Expand Down Expand Up @@ -49,21 +50,36 @@ def _isdir_without_all_perms(dir_path):
)


def get_default_samples_dir():
    """Returns the default samples directory

    The default samples directory is computed according to the following priorities:
        - all systems: the value of the ``KHIOPS_SAMPLES_DIR`` environment variable,
          used as-is, if it is set and non-empty
        - Windows:
            - ``%PUBLIC%\\khiops_data\\samples`` if ``%PUBLIC%`` is defined
            - the ``khiops_data\\samples`` sub-directory of the user's home
              directory (as given by ``pathlib.Path.home()``) otherwise
        - Linux/macOS: the ``khiops_data/samples`` sub-directory of the user's home
          directory (as given by ``pathlib.Path.home()``)

    Returns
    -------
    `str`
        The path of the default samples directory. This function does not check
        that the directory exists.
    """
    # An empty KHIOPS_SAMPLES_DIR is treated exactly like an unset one
    env_samples_dir = os.environ.get("KHIOPS_SAMPLES_DIR")
    if env_samples_dir:
        return env_samples_dir

    # The public directory is only considered on Windows systems
    if platform.system() == "Windows" and "PUBLIC" in os.environ:
        return os.path.join(os.environ["PUBLIC"], "khiops_data", "samples")

    # Fallback for all systems: a directory under the user's home
    return str(pathlib.Path.home() / "khiops_data" / "samples")


def _get_dir_status(a_dir):
"""Returns the status of a local or remote directory

Against a local directory a real check is performed. A remote directory is detected
but not checked.
"""
if fs.is_local_resource(a_dir):
# Remove initial slash on windows systems
# urllib's url2pathname does not work properly
a_dir_res = fs.create_resource(os.path.normpath(a_dir))
a_dir_path = a_dir_res.uri_info.path
if platform.system() == "Windows":
if a_dir_path.startswith("/"):
a_dir_path = a_dir_path[1:]

# a_dir_res is a LocalFilesystemResource already
a_dir_path = a_dir_res.path
if not os.path.exists(a_dir_path):
status = "non-existent"
elif not os.path.isdir(a_dir_path):
Expand Down Expand Up @@ -98,31 +114,6 @@ def _check_samples_dir(samples_dir):
)


def _extract_path_from_uri(uri):
    """Returns the path component of a URI or path

    The input is turned into a resource with ``fs.create_resource`` and the path
    is taken from the resource's ``uri_info``. On Windows two special cases are
    handled:

    - No scheme and a path of the form ``/<LETTER>:<REST>``: the leading slash is
      dropped so the result starts with the drive letter.
    - A single-character scheme (i.e. a drive letter such as ``C:/<REST>``): the
      input is returned unchanged.
    """
    resource = fs.create_resource(uri)
    uri_info = resource.uri_info

    # Non-Windows systems: the URI path is used as-is
    if platform.system() != "Windows":
        return uri_info.path

    # Windows, path of the form "/<LETTER>:...": strip the initial slash
    looks_like_slash_drive = (
        uri_info.scheme == ""
        and uri_info.path[0] == "/"
        and uri_info.path[1].isalpha()
        and uri_info.path[2] == ":"
    )
    if looks_like_slash_drive:
        return uri_info.path[1:]

    # Windows, one-character scheme: it is a drive letter, keep the original input
    if len(uri_info.scheme) == 1:
        return uri

    # Any other case: return the URI path as-is
    return uri_info.path


def _khiops_env_file_exists(env_dir):
"""Check ``khiops_env`` exists relative to the specified environment dir"""
khiops_env_path = os.path.join(env_dir, "khiops_env")
Expand Down Expand Up @@ -399,7 +390,7 @@ def root_temp_dir(self):
def root_temp_dir(self, dir_path):
# Check existence, directory status and permissions for local paths
if fs.is_local_resource(dir_path):
real_dir_path = _extract_path_from_uri(dir_path)
real_dir_path = fs.create_resource(dir_path).path
if os.path.exists(real_dir_path):
if os.path.isfile(real_dir_path):
raise KhiopsEnvironmentError(
Expand Down Expand Up @@ -439,7 +430,7 @@ def create_temp_file(self, prefix, suffix):
# Local resource: Effectively create the file with the python file API
if fs.is_local_resource(self.root_temp_dir):
# Extract the path from the potential URI
root_temp_dir_path = _extract_path_from_uri(self.root_temp_dir)
root_temp_dir_path = fs.create_resource(self.root_temp_dir).path

# Create the temporary file
tmp_file_fd, tmp_file_path = tempfile.mkstemp(
Expand Down Expand Up @@ -470,7 +461,7 @@ def create_temp_dir(self, prefix):
"""
# Local resource: Effectively create the directory with the python file API
if fs.is_local_resource(self.root_temp_dir):
root_temp_dir_path = _extract_path_from_uri(self.root_temp_dir)
root_temp_dir_path = fs.create_resource(self.root_temp_dir).path
temp_dir = tempfile.mkdtemp(prefix=prefix, dir=root_temp_dir_path)
# Remote resource: Just return a highly probable unique path
else:
Expand Down Expand Up @@ -919,7 +910,7 @@ class KhiopsLocalRunner(KhiopsRunner):

- Windows:

- ``%PUBLIC%\khiops_data\samples`` if it exists and is a directory
- ``%PUBLIC%\khiops_data\samples`` if ``%PUBLIC%`` is defined
- ``%USERPROFILE%\khiops_data\samples`` otherwise

- Linux and macOS:
Expand Down Expand Up @@ -1029,38 +1020,9 @@ def _initialize_khiops_environment(self):

def _initialize_default_samples_dir(self):
"""See class docstring"""
# Set the fallback value for the samples directory
home_samples_dir = Path.home() / "khiops_data" / "samples"

# Take the value of an environment variable in priority
if "KHIOPS_SAMPLES_DIR" in os.environ:
self._samples_dir = os.environ["KHIOPS_SAMPLES_DIR"]

# The samples location of Windows systems is:
# - %PUBLIC%\khiops_data\samples if %PUBLIC% exists
# - %USERPROFILE%\khiops_data\samples otherwise
elif platform.system() == "Windows":
if "PUBLIC" in os.environ:
public_samples_dir = os.path.join(
os.environ["PUBLIC"], "khiops_data", "samples"
)
else:
public_samples_dir = None

ok_statuses = ["ok", "remote"]
if (
public_samples_dir is not None
and _get_dir_status(public_samples_dir) in ok_statuses
):
self._samples_dir = public_samples_dir
else:
self._samples_dir = str(home_samples_dir)

# The default samples location on Unix systems is:
# $HOME/khiops/samples on Linux and Mac OS
else:
self._samples_dir = str(home_samples_dir)

samples_dir = get_default_samples_dir()
_check_samples_dir(samples_dir)
self._samples_dir = samples_dir
assert self._samples_dir is not None

def _check_tools(self):
Expand Down
15 changes: 9 additions & 6 deletions khiops/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import zipfile

import khiops.core as kh
from khiops.core.internals.runner import get_default_samples_dir
from khiops.samples import samples as samples_core

# We deactivate the warnings to not show a deprecation warning from sklearn
Expand Down Expand Up @@ -115,7 +116,11 @@ def download_datasets(
"""Downloads the Khiops sample datasets for a given version

The datasets are downloaded to:
- Windows: ``%USERPROFILE%\\khiops_data\\samples``
- all systems: the value of ``KHIOPS_SAMPLES_DIR`` (used as-is) if
``KHIOPS_SAMPLES_DIR`` is defined and non-empty
- Windows:
- ``%PUBLIC%\\khiops_data\\samples`` if ``%PUBLIC%`` is defined
- ``%USERPROFILE%\\khiops_data\\samples`` otherwise
- Linux/macOS: ``$HOME/khiops_data/samples``

Parameters
Expand All @@ -126,10 +131,8 @@ def download_datasets(
The version of the samples datasets.
"""
# Note: The hidden parameter _called_from_shell is just to change the user messages.

# Check if the home sample dataset location is available and build it if necessary
samples_dir = pathlib.Path.home() / "khiops_data" / "samples"
if samples_dir.exists() and not force_overwrite:
samples_dir = get_default_samples_dir()
if os.path.exists(samples_dir) and not force_overwrite:
if _called_from_shell:
instructions = "Execute with '--force-overwrite' to overwrite it"
else:
Expand All @@ -140,7 +143,7 @@ def download_datasets(
)
else:
# Create the samples dataset directory
if samples_dir.exists():
if os.path.exists(samples_dir):
shutil.rmtree(samples_dir)
os.makedirs(samples_dir, exist_ok=True)

Expand Down
1 change: 1 addition & 0 deletions tests/test_dataset_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or #
# see the "LICENSE.md" file for more details. #
######################################################################################
"""Test the expected behavior of the Dataset class"""
import os
import shutil
import unittest
Expand Down
Loading