8
8
from typing import Dict , List , Callable , Union , Tuple
9
9
10
10
from ml4cvd .tensor_maps_by_hand import TMAPS
11
- from ml4cvd .defines import ECG_REST_AMP_LEADS , PARTNERS_DATE_FORMAT , STOP_CHAR , PARTNERS_CHAR_2_IDX
11
+ from ml4cvd .defines import ECG_REST_AMP_LEADS , PARTNERS_DATE_FORMAT , STOP_CHAR , PARTNERS_CHAR_2_IDX , PARTNERS_DATETIME_FORMAT
12
12
from ml4cvd .TensorMap import TensorMap , str2date , Interpretation , make_range_validator , decompress_data , TimeSeriesOrder
13
13
14
14
@@ -88,7 +88,7 @@ def get_voltage_from_file(tm, hd5, dependents={}):
88
88
slices = (i , ..., tm .channel_map [cm ]) if dynamic else (..., tm .channel_map [cm ])
89
89
tensor [slices ] = voltage
90
90
except KeyError :
91
- pass
91
+ logging . warning ( f'KeyError for channel { cm } in { tm . name } ' )
92
92
if population_normalize is not None :
93
93
tensor /= population_normalize
94
94
return tensor
@@ -1370,7 +1370,7 @@ def _diagnosis_channels(disease: str, incidence_only: bool = False):
1370
1370
1371
1371
1372
1372
def _outcome_channels (outcome : str ):
1373
- return {f'no_{ outcome } ' : 0 , f'future_ { outcome } ' : 1 }
1373
+ return {f'no_{ outcome } ' : 0 , f'{ outcome } ' : 1 }
1374
1374
1375
1375
1376
1376
def loyalty_time_to_event (
@@ -1603,13 +1603,23 @@ def _cardiac_surgery_str2date(input_date: str) -> datetime.datetime:
1603
1603
return datetime .datetime .strptime (input_date , "%d%b%Y" )
1604
1604
1605
1605
1606
def _date_in_window_from_dates(ecg_dates, surgery_date, day_window):
    """Pick the first ECG date, in descending string order, that is not too old relative to surgery.

    Candidates are visited in descending string order and the first one whose
    acquisition time is at most `day_window` days before `surgery_date` is returned.

    :param ecg_dates: Iterable of ECG date strings formatted as PARTNERS_DATETIME_FORMAT; it is not modified
    :param surgery_date: datetime.datetime of the surgery
    :param day_window: Maximum number of days the ECG may precede the surgery
    :return: The selected ECG date string, exactly as it appears in `ecg_dates`
    :raises ValueError: If no ECG falls in the window, or a visited date string
        does not match PARTNERS_DATETIME_FORMAT
    """
    max_gap = datetime.timedelta(days=day_window)
    # sorted() works on a copy, so the caller's sequence keeps its original order
    # (list.sort() would reorder it in place and reject non-list iterables).
    # NOTE(review): descending string order is newest-first only if
    # PARTNERS_DATETIME_FORMAT sorts lexicographically (e.g. ISO-like) - confirm.
    for ecg_date in sorted(ecg_dates, reverse=True):
        ecg_datetime = datetime.datetime.strptime(ecg_date, PARTNERS_DATETIME_FORMAT)
        # NOTE(review): only ECGs that are too old are rejected; an ECG acquired after
        # surgery gives a negative difference and is accepted - confirm this is intended.
        if surgery_date - ecg_datetime <= max_gap:
            return ecg_date
    raise ValueError('No ECG in time window')
1613
+
1614
+
1606
1615
def build_cardiac_surgery_outcome_tensor_from_file (
1607
1616
file_name : str ,
1617
+ outcome2column : Dict [str , str ],
1608
1618
patient_column : str = "medrecn" ,
1609
- outcome_column : str = "mtopd" ,
1610
1619
start_column : str = "surgdt" ,
1611
1620
delimiter : str = "," ,
1612
1621
day_window : int = 30 ,
1622
+ population_normalize : int = None ,
1613
1623
) -> Callable :
1614
1624
"""Build a tensor_from_file function for outcomes given CSV of patients.
1615
1625
@@ -1618,7 +1628,7 @@ def build_cardiac_surgery_outcome_tensor_from_file(
1618
1628
1619
1629
:param file_name: CSV or TSV file with header of patient IDs (MRNs) dates of enrollment and dates of diagnosis
1620
1630
:param patient_column: The header name of the column of patient ids
1621
- :param diagnosis_date_column: The header name of the column of disease diagnosis dates
1631
+ :param outcome2column: Dictionary mapping outcome names to the header name of the column with outcome status
1622
1632
:param start_column: The header name of the column of surgery dates
1623
1633
:param delimiter: The delimiter separating columns of the TSV or CSV
1624
1634
:return: The tensor_from_file function to provide to TensorMap constructors
@@ -1630,57 +1640,49 @@ def build_cardiac_surgery_outcome_tensor_from_file(
1630
1640
header = next (reader )
1631
1641
patient_index = header .index (patient_column )
1632
1642
date_index = header .index (start_column )
1633
- outcome_index = header .index (outcome_column )
1634
- date_surg_table = {}
1635
- patient_table = {}
1643
+ surgery_date_table = {}
1644
+ outcome_table = defaultdict (dict )
1636
1645
for row in reader :
1637
1646
try :
1638
1647
patient_key = int (row [patient_index ])
1639
- patient_table [patient_key ] = int (row [outcome_index ])
1640
- date_surg_table [patient_key ] = _cardiac_surgery_str2date (
1641
- row [date_index ],
1642
- )
1643
- if len (patient_table ) % 1000 == 0 :
1644
- logging .debug (f"Processed: { len (patient_table )} patient rows." )
1648
+ surgery_date_table [patient_key ] = _cardiac_surgery_str2date (row [date_index ])
1649
+ for outcome in outcome2column :
1650
+ outcome_table [outcome ][patient_key ] = int (row [header .index (outcome2column [outcome ])])
1651
+ if len (outcome_table ) % 1000 == 0 :
1652
+ logging .debug (f"Processed: { len (outcome_table )} outcome rows." )
1645
1653
except ValueError as e :
1646
- logging .warning (f"val err { e } " )
1654
+ logging .debug (f'Value error { e } ' )
1655
+
1656
+ logging .info (f"Processed outcomes:{ list (outcome_table .keys ())} . Got { len (surgery_date_table )} patients." )
1647
1657
1648
1658
except FileNotFoundError as e :
1649
1659
error = e
1650
1660
1651
- logging .info (
1652
- f"Done processing { outcome_column } . Got { len (patient_table )} patient rows." ,
1653
- )
1654
-
1655
1661
def tensor_from_file (tm : TensorMap , hd5 : h5py .File , dependents = None ):
1656
1662
if error :
1657
1663
raise error
1658
1664
1659
- ecg_dates = _get_ecg_dates (tm , hd5 )
1660
- dynamic , shape = _is_dynamic_shape (tm , len (ecg_dates ))
1661
- categorical_data = np .zeros (shape , dtype = np .float32 )
1662
- for i , ecg_date in enumerate (ecg_dates ):
1663
- mrn_int = _hd5_filename_to_mrn_int (hd5 .filename )
1664
- if mrn_int not in patient_table :
1665
- raise KeyError (f"MRN not in STS outcomes CSV" )
1666
-
1667
- path = _make_hd5_path (tm , ecg_date , 'acquisitiondate' )
1668
- ecg_date = _partners_str2date (
1669
- decompress_data (
1670
- data_compressed = hd5 [path ][()],
1671
- dtype = hd5 [path ].attrs ["dtype" ],
1672
- ),
1673
- )
1665
+ mrn_int = _hd5_filename_to_mrn_int (hd5 .filename )
1666
+ if mrn_int not in surgery_date_table :
1667
+ raise KeyError (f"MRN not in STS outcomes CSV" )
1674
1668
1675
- # Convert ecg_date from datetime.date to datetime.datetime
1676
- ecg_date = datetime .datetime .combine (ecg_date , datetime .time .min )
1669
+ ecg_dates = list (hd5 [tm .path_prefix ])
1670
+ tensor = np .zeros (tm .shape , dtype = np .float32 )
1671
+ for dtm in tm .dependent_map :
1672
+ dependents [tm .dependent_map [dtm ]] = np .zeros (tm .dependent_map [dtm ].shape , dtype = np .float32 )
1673
+ ecg_date = _date_in_window_from_dates (ecg_dates , surgery_date_table [mrn_int ], day_window )
1674
+ for cm in tm .channel_map :
1675
+ path = _make_hd5_path (tm , ecg_date , cm )
1676
+ voltage = decompress_data (data_compressed = hd5 [path ][()], dtype = hd5 [path ].attrs ['dtype' ])
1677
+ voltage = _resample_voltage (voltage , tm .shape [0 ])
1678
+ tensor [..., tm .channel_map [cm ]] = voltage
1679
+ if population_normalize is not None :
1680
+ tensor /= population_normalize
1677
1681
1678
- # If the date of surgery - date of ECG is > time window, skip it
1679
- if date_surg_table [mrn_int ] - ecg_date > datetime .timedelta (days = day_window ):
1680
- raise ValueError (f"ECG out of time window" )
1682
+ for dtm in tm .dependent_map :
1683
+ dependents [tm .dependent_map [dtm ]][outcome_table [dtm ][mrn_int ]] = 1.0
1681
1684
1682
- categorical_data [(i , patient_table [mrn_int ]) if dynamic else (patient_table [mrn_int ],)] = 1.0
1683
- return categorical_data
1685
+ return tensor
1684
1686
return tensor_from_file
1685
1687
1686
1688
@@ -1689,37 +1691,55 @@ def build_cardiac_surgery_tensor_maps(
1689
1691
) -> Dict [str , TensorMap ]:
1690
1692
name2tensormap : Dict [str , TensorMap ] = {}
1691
1693
outcome2column = {
1692
- "death " : "mtopd" ,
1693
- "stroke " : "cnstrokp" ,
1694
- "renal_failure " : "crenfail" ,
1695
- "prolonged_ventilation " : "crenfail" ,
1696
- "dsw_infection " : "deepsterninf" ,
1697
- "reoperation " : "reop" ,
1698
- "any_morbidity " : "anymorbidity" ,
1699
- "long_stay " : "llos" ,
1694
+ "sts_death " : "mtopd" ,
1695
+ "sts_stroke " : "cnstrokp" ,
1696
+ "sts_renal_failure " : "crenfail" ,
1697
+ "sts_prolonged_ventilation " : "crenfail" ,
1698
+ "sts_dsw_infection " : "deepsterninf" ,
1699
+ "sts_reoperation " : "reop" ,
1700
+ "sts_any_morbidity " : "anymorbidity" ,
1701
+ "sts_long_stay " : "llos" ,
1700
1702
}
1701
1703
1704
+ dependent_maps = {}
1702
1705
for outcome in outcome2column :
1703
- name = f"outcome_{ outcome } "
1704
- if name in needed_tensor_maps :
1705
- tensor_from_file_fxn = build_cardiac_surgery_outcome_tensor_from_file (
1706
- file_name = CARDIAC_SURGERY_OUTCOMES_CSV ,
1707
- outcome_column = outcome2column [outcome ],
1708
- day_window = 30 ,
1709
- )
1710
- name2tensormap [name ] = TensorMap (
1711
- name ,
1712
- Interpretation .CATEGORICAL ,
1713
- path_prefix = PARTNERS_PREFIX ,
1714
- channel_map = _outcome_channels (outcome ),
1715
- tensor_from_file = tensor_from_file_fxn ,
1716
- time_series_limit = 0 ,
1717
- )
1718
- name2tensormap [f'{ name } _newest' ] = TensorMap (
1719
- f'{ name } _newest' ,
1720
- Interpretation .CATEGORICAL ,
1721
- path_prefix = PARTNERS_PREFIX ,
1722
- channel_map = _outcome_channels (outcome ),
1723
- tensor_from_file = tensor_from_file_fxn ,
1724
- )
1706
+ channel_map = _outcome_channels (outcome )
1707
+ dependent_maps [outcome ] = TensorMap (outcome , Interpretation .CATEGORICAL , path_prefix = PARTNERS_PREFIX , channel_map = channel_map )
1708
+
1709
+ name = 'ecg_2500_sts'
1710
+ if name in needed_tensor_maps :
1711
+ tensor_from_file_fxn = build_cardiac_surgery_outcome_tensor_from_file (
1712
+ file_name = CARDIAC_SURGERY_OUTCOMES_CSV ,
1713
+ outcome2column = outcome2column ,
1714
+ population_normalize = 2000 ,
1715
+ day_window = 30 ,
1716
+ )
1717
+ name2tensormap [name ] = TensorMap (
1718
+ name ,
1719
+ shape = (2500 , 12 ),
1720
+ path_prefix = PARTNERS_PREFIX ,
1721
+ dependent_map = dependent_maps ,
1722
+ channel_map = ECG_REST_AMP_LEADS ,
1723
+ tensor_from_file = tensor_from_file_fxn ,
1724
+ )
1725
+ name = 'ecg_5000_sts'
1726
+ if name in needed_tensor_maps :
1727
+ tensor_from_file_fxn = build_cardiac_surgery_outcome_tensor_from_file (
1728
+ file_name = CARDIAC_SURGERY_OUTCOMES_CSV ,
1729
+ outcome2column = outcome2column ,
1730
+ population_normalize = 2000 ,
1731
+ day_window = 30 ,
1732
+ )
1733
+ name2tensormap [name ] = TensorMap (
1734
+ name ,
1735
+ shape = (5000 , 12 ),
1736
+ path_prefix = PARTNERS_PREFIX ,
1737
+ dependent_map = dependent_maps ,
1738
+ channel_map = ECG_REST_AMP_LEADS ,
1739
+ tensor_from_file = tensor_from_file_fxn ,
1740
+ )
1741
+ for outcome in outcome2column :
1742
+ if outcome in needed_tensor_maps :
1743
+ name2tensormap [outcome ] = dependent_maps [outcome ]
1744
+
1725
1745
return name2tensormap
0 commit comments