diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b6034ca..9064cefd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ - (`core`) Dictionary API support for dictionary, variable and variable block comments, and dictionary and variable block internal comments. +### Fixed +- (General) Inconsistency between the `tools.download_datasets` function and the + current samples directory according to `core.api.get_samples_dir()`. + + ## 11.0.0.0-b.0 - 2025-07-10 ### Added diff --git a/khiops/core/internals/filesystems.py b/khiops/core/internals/filesystems.py index ac3ca923..bcc7c590 100644 --- a/khiops/core/internals/filesystems.py +++ b/khiops/core/internals/filesystems.py @@ -8,7 +8,6 @@ import json import os -import platform import shutil import warnings from abc import ABC, abstractmethod @@ -59,8 +58,11 @@ def is_local_resource(uri_or_path): `bool` `True` if a URI refers to a local path """ - uri_info = urlparse(uri_or_path, allow_fragments=False) - return len(uri_info.scheme) <= 1 or uri_info.scheme == "file" + if (index := uri_or_path.find("://")) > 0: + scheme = uri_or_path[:index] + return len(scheme) == 1 or scheme == "file" + else: + return True def create_resource(uri_or_path): @@ -80,15 +82,34 @@ def create_resource(uri_or_path): `FilesystemResource` The URI resource object, its class depends on the URI. """ - uri_info = urlparse(uri_or_path, allow_fragments=False) - if uri_info.scheme == "s3": - return AmazonS3Resource(uri_or_path) - elif uri_info.scheme == "gs": - return GoogleCloudStorageResource(uri_or_path) - elif is_local_resource(uri_or_path): - return LocalFilesystemResource(uri_or_path) + # Case where the URI scheme separator `://` is contained in the uri/path + if (index := uri_or_path.find("://")) > 0: + scheme = uri_or_path[:index] + + # Case of normal schemes (those whose scheme is not a single char) + # Note: Any 1-char scheme is considered a Windows path + if len(scheme) > 1: + uri_info = urlparse(uri_or_path, allow_fragments=False) + if uri_info.scheme == "s3": + return AmazonS3Resource(uri_or_path) + elif uri_info.scheme == "gs": + return GoogleCloudStorageResource(uri_or_path) + elif scheme == "file": + # Reject URI if authority is not empty + if uri_info.netloc: + raise ValueError( + f"Non-empty 'authority' in local-path URI '{uri_or_path}': " + f"'{uri_info.netloc}'" + ) + return LocalFilesystemResource(uri_or_path) + else: + raise ValueError(f"Unsupported URI scheme '{uri_info.scheme}'") + else: + return LocalFilesystemResource(uri_or_path) + + # No scheme separator `://` found: Build a local resource else: - raise ValueError(f"Unsupported URI scheme {uri_info.scheme}") + return LocalFilesystemResource(uri_or_path) def parent_path(path): @@ -411,12 +432,8 @@ def __init__(self, uri): # Obtain the local from the URI # Case where the scheme is in fact a windows drive # => Build the proper path with drive - if ( - len(self.uri_info.scheme) == 1 - and self.uri_info.scheme.isalpha() - and platform.system() == "Windows" - ): - self.path = os.path.join(f"{self.uri_info.scheme}:\\", self.uri_info.path) + if len(self.uri_info.scheme) == 1 and self.uri_info.scheme.isalpha(): + self.path = f"{self.uri_info.scheme}:{self.uri_info.path}" # Case of the "file" scheme elif self.uri_info.scheme == "file": # If invalid second colon in path (eg. "/C:/Users"): diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index d60dfb3f..7cbb82aa 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -14,6 +14,7 @@ import io import os +import pathlib import platform import shlex import shutil @@ -49,6 +50,25 @@ def _isdir_without_all_perms(dir_path): ) +def get_default_samples_dir(): + """Returns the default samples directory + + The default samples directory is computed according to the following priorities: + - all systems: ``KHIOPS_SAMPLES_DIR/khiops_data/samples`` if set + - Windows: + - ``%PUBLIC%\\khiops_data\\samples`` if ``%PUBLIC%`` is defined + - ``%USERPROFILE%\\khiops_data\\samples`` otherwise + - Linux/macOS: ``$HOME/khiops_data/samples`` + """ + if "KHIOPS_SAMPLES_DIR" in os.environ and os.environ["KHIOPS_SAMPLES_DIR"]: + samples_dir = os.environ["KHIOPS_SAMPLES_DIR"] + elif platform.system() == "Windows" and "PUBLIC" in os.environ: + samples_dir = os.path.join(os.environ["PUBLIC"], "khiops_data", "samples") + else: + samples_dir = str(pathlib.Path.home() / "khiops_data" / "samples") + return samples_dir + + def _get_dir_status(a_dir): """Returns the status of a local or remote directory @@ -56,14 +76,10 @@ def _get_dir_status(a_dir): but not checked. """ if fs.is_local_resource(a_dir): - # Remove initial slash on windows systems - # urllib's url2pathname does not work properly a_dir_res = fs.create_resource(os.path.normpath(a_dir)) - a_dir_path = a_dir_res.uri_info.path - if platform.system() == "Windows": - if a_dir_path.startswith("/"): - a_dir_path = a_dir_path[1:] + # a_dir_res is a LocalFilesystemResource already + a_dir_path = a_dir_res.path if not os.path.exists(a_dir_path): status = "non-existent" elif not os.path.isdir(a_dir_path): @@ -98,31 +114,6 @@ def _check_samples_dir(samples_dir): ) -def _extract_path_from_uri(uri): - res = fs.create_resource(uri) - if platform.system() == "Windows": - # Case of file:///:/: - # Eliminate first slash ("/") from path if the first component - if ( - res.uri_info.scheme == "" - and res.uri_info.path[0] == "/" - and res.uri_info.path[1].isalpha() - and res.uri_info.path[2] == ":" - ): - path = res.uri_info.path[1:] - # Case of C:/: - # Just use the original path - elif len(res.uri_info.scheme) == 1: - path = uri - # Otherwise return URI path as-is - else: - path = res.uri_info.path - - else: - path = res.uri_info.path - return path - - def _khiops_env_file_exists(env_dir): """Check ``khiops_env`` exists relative to the specified environment dir""" khiops_env_path = os.path.join(env_dir, "khiops_env") @@ -399,7 +390,7 @@ def root_temp_dir(self): def root_temp_dir(self, dir_path): # Check existence, directory status and permissions for local paths if fs.is_local_resource(dir_path): - real_dir_path = _extract_path_from_uri(dir_path) + real_dir_path = fs.create_resource(dir_path).path if os.path.exists(real_dir_path): if os.path.isfile(real_dir_path): raise KhiopsEnvironmentError( @@ -439,7 +430,7 @@ def create_temp_file(self, prefix, suffix): # Local resource: Effectively create the file with the python file API if fs.is_local_resource(self.root_temp_dir): # Extract the path from the potential URI - root_temp_dir_path = _extract_path_from_uri(self.root_temp_dir) + root_temp_dir_path = fs.create_resource(self.root_temp_dir).path # Create the temporary file tmp_file_fd, tmp_file_path = tempfile.mkstemp( @@ -470,7 +461,7 @@ def create_temp_dir(self, prefix): """ # Local resource: Effectively create the directory with the python file API if fs.is_local_resource(self.root_temp_dir): - root_temp_dir_path = _extract_path_from_uri(self.root_temp_dir) + root_temp_dir_path = fs.create_resource(self.root_temp_dir).path temp_dir = tempfile.mkdtemp(prefix=prefix, dir=root_temp_dir_path) # Remote resource: Just return a highly probable unique path else: @@ -919,7 +910,7 @@ class KhiopsLocalRunner(KhiopsRunner): - Windows: - - ``%PUBLIC%\khiops_data\samples%`` if it exists and is a directory + - ``%PUBLIC%\khiops_data\samples%`` if ``%PUBLIC%`` is defined - ``%USERPROFILE%\khiops_data\samples%`` otherwise - Linux and macOS: @@ -1029,38 +1020,9 @@ def _initialize_khiops_environment(self): def _initialize_default_samples_dir(self): """See class docstring""" - # Set the fallback value for the samples directory - home_samples_dir = Path.home() / "khiops_data" / "samples" - - # Take the value of an environment variable in priority - if "KHIOPS_SAMPLES_DIR" in os.environ: - self._samples_dir = os.environ["KHIOPS_SAMPLES_DIR"] - - # The samples location of Windows systems is: - # - %PUBLIC%\khiops_data\samples if %PUBLIC% exists - # - %USERPROFILE%\khiops_data\samples otherwise - elif platform.system() == "Windows": - if "PUBLIC" in os.environ: - public_samples_dir = os.path.join( - os.environ["PUBLIC"], "khiops_data", "samples" - ) - else: - public_samples_dir = None - - ok_statuses = ["ok", "remote"] - if ( - public_samples_dir is not None - and _get_dir_status(public_samples_dir) in ok_statuses - ): - self._samples_dir = public_samples_dir - else: - self._samples_dir = str(home_samples_dir) - - # The default samples location on Unix systems is: - # $HOME/khiops/samples on Linux and Mac OS - else: - self._samples_dir = str(home_samples_dir) - + samples_dir = get_default_samples_dir() + _check_samples_dir(samples_dir) + self._samples_dir = samples_dir assert self._samples_dir is not None def _check_tools(self): diff --git a/khiops/tools.py b/khiops/tools.py index e1f1699a..1359a35a 100644 --- a/khiops/tools.py +++ b/khiops/tools.py @@ -21,6 +21,7 @@ import zipfile import khiops.core as kh +from khiops.core.internals.runner import get_default_samples_dir from khiops.samples import samples as samples_core # We deactivate the warnings to not show a deprecation warning from sklearn @@ -115,7 +116,11 @@ def download_datasets( """Downloads the Khiops sample datasets for a given version The datasets are downloaded to: - - Windows: ``%USERPROFILE%\\khiops_data\\samples`` + - all systems: ``KHIOPS_SAMPLES_DIR/khiops_data/samples`` if + ``KHIOPS_SAMPLES_DIR`` is defined and non-empty + - Windows: + - ``%PUBLIC%\\khiops_data\\samples`` if ``%PUBLIC%`` is defined + - ``%USERPROFILE%\\khiops_data\\samples`` otherwise - Linux/macOS: ``$HOME/khiops_data/samples`` Parameters @@ -126,10 +131,8 @@ def download_datasets( The version of the samples datasets. """ # Note: The hidden parameter _called_from_shell is just to change the user messages. - - # Check if the home sample dataset location is available and build it if necessary - samples_dir = pathlib.Path.home() / "khiops_data" / "samples" - if samples_dir.exists() and not force_overwrite: + samples_dir = get_default_samples_dir() + if os.path.exists(samples_dir) and not force_overwrite: if _called_from_shell: instructions = "Execute with '--force-overwrite' to overwrite it" else: @@ -140,7 +143,7 @@ def download_datasets( ) else: # Create the samples dataset directory - if samples_dir.exists(): + if os.path.exists(samples_dir): shutil.rmtree(samples_dir) os.makedirs(samples_dir, exist_ok=True) diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index a63167cf..23753852 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -4,6 +4,7 @@ # which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or # # see the "LICENSE.md" file for more details. # ###################################################################################### +"""Test the expected behavior of the Dataset class""" import os import shutil import unittest diff --git a/tests/test_khiops_integrations.py b/tests/test_khiops_integrations.py index 15b765dc..7c8f4bc4 100644 --- a/tests/test_khiops_integrations.py +++ b/tests/test_khiops_integrations.py @@ -17,6 +17,7 @@ import khiops.core as kh import khiops.core.internals.filesystems as fs +from khiops import tools from khiops.core.exceptions import KhiopsEnvironmentError from khiops.core.internals.runner import KhiopsLocalRunner from khiops.extras.docker import KhiopsDockerRunner @@ -30,6 +31,118 @@ class KhiopsRunnerEnvironmentTests(unittest.TestCase): """Test that runners in different environments work""" + @unittest.skipIf( + os.environ.get("SKIP_EXPENSIVE_TESTS", "false").lower() == "true", + "Skipping expensive test", + ) + def test_samples_are_downloaded_according_to_the_runner_setting(self): + """Test that samples are downloaded to the runner samples directory""" + + # Save initial state + initial_runner = kh.get_runner() + initial_khiops_samples_dir = os.environ.get("KHIOPS_SAMPLES_DIR") + + # Test that default samples download location is consistent with the + # KhiopsLocalRunner samples directory + with tempfile.TemporaryDirectory() as tmp_samples_dir: + + # Set environment variable to the temporary samples dir + os.environ["KHIOPS_SAMPLES_DIR"] = tmp_samples_dir + + # Create test runner to update samples dir to tmp_samples_dir, + # according to the newly-set environment variable + test_runner = KhiopsLocalRunner() + + # Set current runner to the test runner + kh.set_runner(test_runner) + + # Check that samples are not in tmp_samples_dir + with self.assertRaises(AssertionError): + self.assert_samples_dir_integrity(tmp_samples_dir) + + # Download samples into existing, but empty, tmp_samples_dir + # Check that samples have been downloaded to tmp_samples_dir + tools.download_datasets(force_overwrite=True) + self.assert_samples_dir_integrity(tmp_samples_dir) + + # Remove KHIOPS_SAMPLES_DIR + # Create test runner to update samples dir to the default runner samples + # dir, following the deletion of the KHIOPS_SAMPLES_DIR environment + # variable + # Set current runner to the test runner + del os.environ["KHIOPS_SAMPLES_DIR"] + test_runner = KhiopsLocalRunner() + kh.set_runner(test_runner) + + # Get the default runner samples dir + default_runner_samples_dir = kh.get_samples_dir() + + # Move existing default runner samples dir contents to temporary directory + if os.path.isdir(default_runner_samples_dir): + tmp_initial_samples_dir = tempfile.mkdtemp() + shutil.copytree( + default_runner_samples_dir, + tmp_initial_samples_dir, + dirs_exist_ok=True, + ) + shutil.rmtree(default_runner_samples_dir) + else: + tmp_initial_samples_dir = None + + # Check that the samples are not present in the default runner + # samples dir + with self.assertRaises(AssertionError): + self.assert_samples_dir_integrity(default_runner_samples_dir) + + # Download datasets to the default runner samples dir (which + # should be created on this occasion) + # Default samples dir does not exist anymore + # Check that the default samples dir is populated + tools.download_datasets() + self.assert_samples_dir_integrity(default_runner_samples_dir) + + # Clean-up default samples dir + shutil.rmtree(default_runner_samples_dir) + + # Restore initial state: + # - initial samples dir contents if previously present + # - initial KHIOPS_SAMPLES_DIR if set + # - initial runner + if tmp_initial_samples_dir is not None and os.path.isdir( + tmp_initial_samples_dir + ): + shutil.copytree( + tmp_initial_samples_dir, + default_runner_samples_dir, + dirs_exist_ok=True, + ) + + # Remove temporary directory + shutil.rmtree(tmp_initial_samples_dir) + if initial_khiops_samples_dir is not None: + os.environ["KHIOPS_SAMPLES_DIR"] = initial_khiops_samples_dir + kh.set_runner(initial_runner) + + def assert_samples_dir_integrity(self, samples_dir): + """Checks that the samples dir has the expected structure""" + expected_dataset_names = [ + "Accidents", + "AccidentsSummary", + "Adult", + "Customer", + "CustomerExtended", + "Iris", + "Letter", + "Mushroom", + "NegativeAirlineTweets", + "SpliceJunction", + "Vehicle", + "WineReviews", + ] + self.assertTrue(os.path.isdir(samples_dir)) + for ds_name in expected_dataset_names: + self.assertIn(ds_name, os.listdir(samples_dir)) + @unittest.skipIf( platform.system() != "Linux", "Skipping test for non-Linux platform" )