Skip to content

Commit a01c98d

Browse files
authored
Merge pull request #5819 from PrimozGodec/datetime-format-selection
[ENH] Datetime format selection
2 parents 269d7fa + 55352c2 commit a01c98d

File tree

4 files changed

+306
-97
lines changed

4 files changed

+306
-97
lines changed

Orange/data/tests/test_variable.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from io import StringIO
1414

1515
import numpy as np
16+
import pandas as pd
1617
import scipy.sparse as sp
1718

1819
from Orange.data import Variable, ContinuousVariable, DiscreteVariable, \
@@ -698,6 +699,117 @@ def varcls_modified(self, name):
698699
var.have_time = 1
699700
return var
700701

702+
def test_additional_formats(self):
703+
expected_date = datetime(2022, 2, 7)
704+
dates = {
705+
"2021-11-25": ("2022-02-07",),
706+
"25.11.2021": ("07.02.2022", "07. 02. 2022", "7.2.2022", "7. 2. 2022"),
707+
"25.11.21": ("07.02.22", "07. 02. 22", "7.2.22", "7. 2. 22"),
708+
"11/25/2021": ("02/07/2022", "2/7/2022"),
709+
"11/25/21": ("02/07/22", "2/7/22"),
710+
"20211125": ("20220207",),
711+
}
712+
expected_date_time = datetime(2022, 2, 7, 10, 11, 12)
713+
date_times = {
714+
"2021-11-25 00:00:00": (
715+
"2022-02-07 10:11:12",
716+
"2022-02-07 10:11:12.00",
717+
),
718+
"25.11.2021 00:00:00": (
719+
"07.02.2022 10:11:12",
720+
"07. 02. 2022 10:11:12",
721+
"7.2.2022 10:11:12",
722+
"7. 2. 2022 10:11:12",
723+
"07.02.2022 10:11:12.00",
724+
"07. 02. 2022 10:11:12.00",
725+
"7.2.2022 10:11:12.00",
726+
"7. 2. 2022 10:11:12.00",
727+
),
728+
"25.11.21 00:00:00": (
729+
"07.02.22 10:11:12",
730+
"07. 02. 22 10:11:12",
731+
"7.2.22 10:11:12",
732+
"7. 2. 22 10:11:12",
733+
"07.02.22 10:11:12.00",
734+
"07. 02. 22 10:11:12.00",
735+
"7.2.22 10:11:12.00",
736+
"7. 2. 22 10:11:12.00",
737+
),
738+
"11/25/2021 00:00:00": (
739+
"02/07/2022 10:11:12",
740+
"2/7/2022 10:11:12",
741+
"02/07/2022 10:11:12.00",
742+
"2/7/2022 10:11:12.00",
743+
),
744+
"11/25/21 00:00:00": (
745+
"02/07/22 10:11:12",
746+
"2/7/22 10:11:12",
747+
"02/07/22 10:11:12.00",
748+
"2/7/22 10:11:12.00",
749+
),
750+
"20211125000000": ("20220207101112", "20220207101112.00"),
751+
}
752+
# times without seconds
753+
expected_date_time2 = datetime(2022, 2, 7, 10, 11, 0)
754+
date_times2 = {
755+
"2021-11-25 00:00:00": ("2022-02-07 10:11",),
756+
"25.11.2021 00:00:00": (
757+
"07.02.2022 10:11",
758+
"07. 02. 2022 10:11",
759+
"7.2.2022 10:11",
760+
"7. 2. 2022 10:11",
761+
),
762+
"25.11.21 00:00:00": (
763+
"07.02.22 10:11",
764+
"07. 02. 22 10:11",
765+
"7.2.22 10:11",
766+
"7. 2. 22 10:11",
767+
),
768+
"11/25/2021 00:00:00": ("02/07/2022 10:11", "2/7/2022 10:11"),
769+
"11/25/21 00:00:00": ("02/07/22 10:11", "2/7/22 10:11"),
770+
"20211125000000": ("202202071011",),
771+
}
772+
# datetime defaults to 1900, 01, 01
773+
expected_time = datetime(1900, 1, 1, 10, 11, 12)
774+
times = {
775+
"00:00:00": ("10:11:12", "10:11:12.00"),
776+
"000000": ("101112", "101112.00"),
777+
}
778+
expected_time2 = datetime(1900, 1, 1, 10, 11, 0)
779+
times2 = {
780+
"00:00:00": ("10:11",),
781+
}
782+
expected_year = datetime(2022, 1, 1)
783+
years = {
784+
"2021": (2022,),
785+
}
786+
expected_day = datetime(1900, 2, 7)
787+
days = {
788+
"11-25": ("02-07",),
789+
"25.11.": ("07.02.", "07. 02.", "7.2.", "7. 2."),
790+
"11/25": ("02/07", "2/7"),
791+
}
792+
data = (
793+
(expected_date, dates),
794+
(expected_date_time, date_times),
795+
(expected_date_time2, date_times2),
796+
(expected_time, times),
797+
(expected_time2, times2),
798+
(expected_year, years),
799+
(expected_day, days),
800+
)
801+
for expected, dts in data:
802+
for k, dt in dts.items():
803+
for t in dt:
804+
parsed = [
805+
pd.to_datetime(t, format=f, errors="coerce")
806+
for f in TimeVariable.ADDITIONAL_FORMATS[k][0]
807+
]
808+
# test any equal to expected
809+
self.assertTrue(any(d == expected for d in parsed))
810+
# test that no format parses to a different date - each result is null (NaT) or expected
811+
self.assertTrue(any(d == expected or pd.isnull(d) for d in parsed))
812+
701813

702814
PickleContinuousVariable = create_pickling_tests(
703815
"PickleContinuousVariable",

Orange/data/variable.py

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -870,7 +870,7 @@ class TimeVariable(ContinuousVariable):
870870
871871
If time is specified without a date, Unix epoch is assumed.
872872
873-
If time is specified wihout an UTC offset, localtime is assumed.
873+
If time is specified without a UTC offset, localtime is assumed.
874874
"""
875875
_all_vars = {}
876876
TYPE_HEADERS = ('time', 't')
@@ -923,15 +923,86 @@ class TimeVariable(ContinuousVariable):
923923
r'\d{1,4}(-?\d{2,3})?'
924924
r')$')
925925

926+
ADDITIONAL_FORMATS = {
927+
"2021-11-25": (("%Y-%m-%d",), 1, 0),
928+
"25.11.2021": (("%d.%m.%Y", "%d. %m. %Y"), 1, 0),
929+
"25.11.21": (("%d.%m.%y", "%d. %m. %y"), 1, 0),
930+
"11/25/2021": (("%m/%d/%Y",), 1, 0),
931+
"11/25/21": (("%m/%d/%y",), 1, 0),
932+
"20211125": (("%Y%m%d",), 1, 0),
933+
# it would be too many options if we also include all time formats with
934+
# lengths up to minutes, up to seconds and up to milliseconds,
935+
# joining all three options under 00:00:00
936+
"2021-11-25 00:00:00": (
937+
(
938+
"%Y-%m-%d %H:%M",
939+
"%Y-%m-%d %H:%M:%S",
940+
"%Y-%m-%d %H:%M:%S.%f",
941+
),
942+
1,
943+
1,
944+
),
945+
"25.11.2021 00:00:00": (
946+
(
947+
"%d.%m.%Y %H:%M",
948+
"%d. %m. %Y %H:%M",
949+
"%d.%m.%Y %H:%M:%S",
950+
"%d. %m. %Y %H:%M:%S",
951+
"%d.%m.%Y %H:%M:%S.%f",
952+
"%d. %m. %Y %H:%M:%S.%f",
953+
),
954+
1,
955+
1,
956+
),
957+
"25.11.21 00:00:00": (
958+
(
959+
"%d.%m.%y %H:%M",
960+
"%d. %m. %y %H:%M",
961+
"%d.%m.%y %H:%M:%S",
962+
"%d. %m. %y %H:%M:%S",
963+
"%d.%m.%y %H:%M:%S.%f",
964+
"%d. %m. %y %H:%M:%S.%f",
965+
),
966+
1,
967+
1,
968+
),
969+
"11/25/2021 00:00:00": (
970+
(
971+
"%m/%d/%Y %H:%M",
972+
"%m/%d/%Y %H:%M:%S",
973+
"%m/%d/%Y %H:%M:%S.%f",
974+
),
975+
1,
976+
1,
977+
),
978+
"11/25/21 00:00:00": (
979+
(
980+
"%m/%d/%y %H:%M",
981+
"%m/%d/%y %H:%M:%S",
982+
"%m/%d/%y %H:%M:%S.%f",
983+
),
984+
1,
985+
1,
986+
),
987+
"20211125000000": (("%Y%m%d%H%M", "%Y%m%d%H%M%S", "%Y%m%d%H%M%S.%f"), 1, 1),
988+
"00:00:00": (("%H:%M", "%H:%M:%S", "%H:%M:%S.%f"), 0, 1),
989+
"000000": (("%H%M", "%H%M%S", "%H%M%S.%f"), 0, 1),
990+
"2021": (("%Y",), 1, 0),
991+
"11-25": (("%m-%d",), 1, 0),
992+
"25.11.": (("%d.%m.", "%d. %m."), 1, 0),
993+
"11/25": (("%m/%d",), 1, 0),
994+
"1125": (("%m%d",), 1, 0),
995+
}
996+
926997
class InvalidDateTimeFormatError(ValueError):
927998
def __init__(self, date_string):
928999
super().__init__(
929-
"Invalid datetime format '{}'. "
930-
"Only ISO 8601 supported.".format(date_string))
1000+
f"Invalid datetime format '{date_string}'. Only ISO 8601 supported."
1001+
)
9311002

9321003
_matches_iso_format = re.compile(REGEX).match
9331004

934-
# If parsed datetime values provide an offset or timzone, it is used for display.
1005+
# If parsed datetime values provide an offset or timezone, it is used for display.
9351006
# If not all values have the same offset, +0000 (=UTC) timezone is used
9361007
_timezone = None
9371008

@@ -1011,6 +1082,7 @@ def parse(self, datestr):
10111082
"""
10121083
if datestr in MISSING_VALUES:
10131084
return Unknown
1085+
10141086
datestr = datestr.strip().rstrip('Z')
10151087
datestr = self._tzre_sub(datestr)
10161088

0 commit comments

Comments
 (0)