@@ -77,12 +77,11 @@ def return_df_in_original_format(df, received_ID_col=False, received_single_time
         pd.Dataframe
             original input format
     """
-    new_df = df.copy(deep=True)
     if not received_ID_col and received_single_time_series:
-        assert len(new_df["ID"].unique()) == 1
-        new_df.drop("ID", axis=1, inplace=True)
+        assert len(df["ID"].unique()) == 1
+        df.drop("ID", axis=1, inplace=True)
         log.info("Returning df with no ID column")
-    return new_df
+    return df
 
 
 def merge_dataframes(df: pd.DataFrame) -> pd.DataFrame:
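Note: in the version above, drop("ID", axis=1, inplace=True) removes the ID column from the very frame return_df_in_original_format receives, so the caller's dataframe is modified as a side effect. A minimal caller-side sketch, assuming the caller still needs the original frame afterwards (variable names are illustrative, not from this patch):

    df_out = return_df_in_original_format(
        df.copy(deep=True), received_ID_col=False, received_single_time_series=True
    )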
@@ -102,7 +101,7 @@ def merge_dataframes(df: pd.DataFrame) -> pd.DataFrame:
         raise ValueError("Can not join other than pd.DataFrames")
     if "ID" not in df.columns:
         raise ValueError("df does not contain 'ID' column")
-    df_merged = df.copy(deep=True).drop("ID", axis=1)
+    df_merged = df.drop("ID", axis=1)
     df_merged = df_merged.sort_values("ds")
     df_merged = df_merged.drop_duplicates(subset=["ds"])
     df_merged = df_merged.reset_index(drop=True)
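DataFrame.drop without inplace=True returns a new object, so merge_dataframes leaves its argument's ID column intact even without an up-front copy. A standalone illustration of that pandas behaviour (not part of the patch):

    import pandas as pd

    df = pd.DataFrame({"ID": ["a", "a"], "ds": pd.to_datetime(["2024-01-01", "2024-01-02"]), "y": [1.0, 2.0]})
    merged = df.drop("ID", axis=1)  # new frame without the ID column
    assert "ID" in df.columns and "ID" not in merged.columns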
@@ -282,11 +281,8 @@ def init_data_params(
             ShiftScale entries containing ``shift`` and ``scale`` parameters for each column
     """
     # Compute Global data params
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
-    df_merged = df.copy(deep=True).drop("ID", axis=1)
     global_data_params = data_params_definition(
-        df_merged, normalize, config_lagged_regressors, config_regressors, config_events, config_seasonality
+        df, normalize, config_lagged_regressors, config_regressors, config_events, config_seasonality
     )
     if global_normalization:
         log.debug(
@@ -296,7 +292,6 @@ def init_data_params(
     local_data_params = OrderedDict()
     local_run_despite_global = True if global_normalization else None
     for df_name, df_i in df.groupby("ID"):
-        df_i.drop("ID", axis=1, inplace=True)
         local_data_params[df_name] = data_params_definition(
             df=df_i,
             normalize=normalize,
@@ -378,7 +373,6 @@ def normalize(df, data_params):
         pd.DataFrame
             normalized dataframes
     """
-    df = df.copy(deep=True)
     for name in df.columns:
         if name == "ID":
             continue
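Without the private copy, normalize operates directly on the frame it is given, so any columns it adds or rewrites appear on the caller's dataframe as well. A hedged caller-side sketch (data_params as produced by init_data_params; whether to copy first is the caller's choice):

    df_norm = normalize(df.copy(deep=True), data_params)  # keep the raw frame untouched
    # or accept the in-place behaviour:
    df = normalize(df, data_params)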
@@ -428,8 +422,7 @@ def check_dataframe(
         pd.DataFrame or dict
             checked dataframe
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
+    # TODO: move call to check_multiple_series_id here
     if df.groupby("ID").size().min() < 1:
         raise ValueError("Dataframe has no rows.")
     if "ds" not in df:
@@ -542,7 +535,7 @@ def _crossvalidation_split_df(df, n_lags, n_forecasts, k, fold_pct, fold_overlap
     min_train = total_samples - samples_fold - (k - 1) * (samples_fold - samples_overlap)
     assert min_train >= samples_fold
     folds = []
-    df_fold = df.copy(deep=True)
+    df_fold = df
     for i in range(k, 0, -1):
         df_train, df_val = split_df(df_fold, n_lags, n_forecasts, valid_p=samples_fold, inputs_overbleed=True)
         folds.append((df_train, df_val))
@@ -635,33 +628,30 @@ def _crossvalidation_with_time_threshold(df, n_lags, n_forecasts, k, fold_pct, f
 
             validation data
     """
-    df_merged = merge_dataframes(df)
+    df_merged = merge_dataframes(df.copy(deep=True))
     total_samples = len(df_merged) - n_lags + 2 - (2 * n_forecasts)
     samples_fold = max(1, int(fold_pct * total_samples))
     samples_overlap = int(fold_overlap_pct * samples_fold)
     assert samples_overlap < samples_fold
     min_train = total_samples - samples_fold - (k - 1) * (samples_fold - samples_overlap)
     assert min_train >= samples_fold
     folds = []
-    df_fold = df
-    # df_fold = df.copy(deep=True)
-    # df_fold, _, _, _ = check_multiple_series_id(df_fold)
     for i in range(k, 0, -1):
-        threshold_time_stamp = find_time_threshold(df_fold, n_lags, n_forecasts, samples_fold, inputs_overbleed=True)
+        threshold_time_stamp = find_time_threshold(df, n_lags, n_forecasts, samples_fold, inputs_overbleed=True)
         df_train, df_val = split_considering_timestamp(
-            df_fold, n_lags, n_forecasts, inputs_overbleed=True, threshold_time_stamp=threshold_time_stamp
+            df, n_lags, n_forecasts, inputs_overbleed=True, threshold_time_stamp=threshold_time_stamp
         )
         folds.append((df_train, df_val))
         split_idx = len(df_merged) - samples_fold + samples_overlap
         df_merged = df_merged[:split_idx].reset_index(drop=True)
         threshold_time_stamp = df_merged["ds"].iloc[-1]
         df_fold_aux = pd.DataFrame()
-        for df_name, df_i in df_fold.groupby("ID"):
-            df_aux = (
-                df_i.copy(deep=True).iloc[: len(df_i[df_i["ds"] < threshold_time_stamp]) + 1].reset_index(drop=True)
-            )
+        for df_name, df_i in df.groupby("ID"):
+            # df_i = df_i.copy(deep=True)
+            df_aux = df_i.iloc[: len(df_i[df_i["ds"] < threshold_time_stamp]) + 1].reset_index(drop=True)
             df_fold_aux = pd.concat((df_fold_aux, df_aux), ignore_index=True)
-        df_fold = df_fold_aux.copy(deep=True)
+        df = df_fold_aux
+        # df = df.copy(deep=True)
     folds = folds[::-1]
     return folds
 
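A hedged usage sketch of the time-threshold cross-validation helper, assuming a frame that already carries ds, y, and ID columns (the argument values are illustrative):

    folds = _crossvalidation_with_time_threshold(
        df, n_lags=3, n_forecasts=1, k=5, fold_pct=0.1, fold_overlap_pct=0.5
    )
    for df_train, df_val in folds:
        pass  # each fold is a (train, validation) pair sharing one time threshold across IDs

The list is built from the latest split backwards and reversed by folds[::-1], so callers receive the folds in chronological order.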
@@ -707,7 +697,6 @@ def crossvalidation_split_df(
 
             validation data
     """
-    # df = df.copy(deep=True)
     df, _, _, _ = check_multiple_series_id(df)
     folds = []
     if len(df["ID"].unique()) == 1:
@@ -733,7 +722,7 @@ def crossvalidation_split_df(
             start_date, end_date = find_valid_time_interval_for_cv(df)
             for df_name, df_i in df.groupby("ID"):
                 mask = (df_i["ds"] >= start_date) & (df_i["ds"] <= end_date)
-                df_i = df_i[mask].copy(deep=True)
+                df_i = df_i[mask]
                 folds_dict[df_name] = _crossvalidation_split_df(
                     df_i, n_lags, n_forecasts, k, fold_pct, fold_overlap_pct
                 )
@@ -768,8 +757,6 @@ def double_crossvalidation_split_df(df, n_lags, n_forecasts, k, valid_pct, test_
         tuple of k tuples [(folds_val, folds_test), …]
             elements same as :meth:`crossvalidation_split_df` returns
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     if len(df["ID"].unique()) > 1:
         raise NotImplementedError("double_crossvalidation_split_df not implemented for df with many time series")
     fold_pct_test = float(test_pct) / k
@@ -800,7 +787,7 @@ def find_time_threshold(df, n_lags, n_forecasts, valid_p, inputs_overbleed):
         str
             time stamp threshold defines the boundary for the train and validation sets split.
     """
-    df_merged = merge_dataframes(df)
+    df_merged = merge_dataframes(df.copy(deep=True))
     n_samples = len(df_merged) - n_lags + 2 - (2 * n_forecasts)
     n_samples = n_samples if inputs_overbleed else n_samples - n_lags
     if 0.0 < valid_p < 1.0:
@@ -842,11 +829,14 @@ def split_considering_timestamp(df, n_lags, n_forecasts, inputs_overbleed, thres
     df_val = pd.DataFrame()
     for df_name, df_i in df.groupby("ID"):
         if df[df["ID"] == df_name]["ds"].max() < threshold_time_stamp:
-            df_train = pd.concat((df_train, df_i.copy(deep=True)), ignore_index=True)
+            # df_i = df_i.copy(deep=True)
+            df_train = pd.concat((df_train, df_i), ignore_index=True)
         elif df[df["ID"] == df_name]["ds"].min() > threshold_time_stamp:
-            df_val = pd.concat((df_val, df_i.copy(deep=True)), ignore_index=True)
+            # df_i = df_i.copy(deep=True)
+            df_val = pd.concat((df_val, df_i), ignore_index=True)
         else:
-            df_aux = df_i.copy(deep=True)
+            df_aux = df_i
+            # df_i = df_i.copy(deep=True)
             n_train = len(df_aux[df_aux["ds"] < threshold_time_stamp])
             split_idx_train = n_train + n_lags + n_forecasts - 1
             split_idx_val = split_idx_train - n_lags if inputs_overbleed else split_idx_train
@@ -890,8 +880,6 @@ def split_df(
         pd.DataFrame, dict
             validation data
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     df_train = pd.DataFrame()
     df_val = pd.DataFrame()
     if local_split:
@@ -1373,8 +1361,6 @@ def infer_frequency(df, freq, n_lags, min_freq_percentage=0.7):
             Valid frequency tag according to major frequency.
 
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     freq_df = list()
     for df_name, df_i in df.groupby("ID"):
         freq_df.append(_infer_frequency(df_i, freq, min_freq_percentage))
@@ -1396,6 +1382,7 @@ def create_dict_for_events_or_regressors(
     df: pd.DataFrame,
     other_df: Optional[pd.DataFrame],
     other_df_name: str,
+    received_ID_col: bool,
 ) -> dict:  # Not sure about the naming of this function
     """Create a dict for events or regressors according to input df.
 
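The received_ID_col flag tells create_dict_for_events_or_regressors whether the auxiliary frame arrived with an ID column, instead of the function re-deriving it internally. A hedged sketch of a caller obtaining the flag from check_multiple_series_id (the events_df name and "events" label are illustrative):

    events_df, received_ID_col, _, _ = check_multiple_series_id(events_df)
    events_dict = create_dict_for_events_or_regressors(df, events_df, "events", received_ID_col)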
@@ -1417,12 +1404,10 @@ def create_dict_for_events_or_regressors(
     if other_df is None:
         # if other_df is None, create dictionary with None for each ID
         return {df_name: None for df_name in df_names}
-    other_df = other_df.copy(deep=True)
-    other_df, received_ID_col, _, _ = check_multiple_series_id(other_df)
     # if other_df does not contain ID, create dictionary with original ID with the same other_df for each ID
     if not received_ID_col:
         other_df = other_df.drop("ID", axis=1)
-        return {df_name: other_df.copy(deep=True) for df_name in df_names}
+        return {df_name: other_df for df_name in df_names}
 
     # else, other_df does contain ID, create dict with respective IDs
     df_unique_names, other_df_unique_names = list(df["ID"].unique()), list(other_df["ID"].unique())
@@ -1438,7 +1423,7 @@ def create_dict_for_events_or_regressors(
     df_other_dict = {}
     for df_name in df_unique_names:
         if df_name in other_df_unique_names:
-            df_aux = other_df[other_df["ID"] == df_name].reset_index(drop=True).copy(deep=True)
+            df_aux = other_df[other_df["ID"] == df_name].reset_index(drop=True)
             df_aux.drop("ID", axis=1, inplace=True)
         else:
             df_aux = None