Skip to content

Clustering

Classes

Tak(array, index_patients, dict_label_id, timescale, evt_log)

Defines the TAK object.

Initialize the Tak class.

Parameters:

Name Type Description Default
array NDArray

1 row = 1 patient, 1 column = 1 timestamp

required
index_patients NDArray

patient IDs in the same order as in array

required
dict_label_id dict

dictionary mapping event names to their IDs

required
timescale int

time window size (in days); sequences may be resampled if != 1

required
evt_log DataFrame

initial event log used by TAK

required
Source code in opentak/clustering.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def __init__(
    self,
    array: npt.NDArray,
    index_patients: npt.NDArray,
    dict_label_id: dict,
    timescale: int,
    evt_log: pd.DataFrame,
):
    """Store the inputs of a TAK model and reset its fit state.

    :param array: 1 row = 1 patient, 1 column = 1 timestamp
    :param index_patients: patient IDs in the same order as in ``array``
    :param dict_label_id: dictionary mapping event names to their IDs; it is
        kept by reference and receives an extra ``"other"`` entry
    :param timescale: time window size (in days); sequences may be resampled if ``!= 1``
    :param evt_log: initial event log used by TAK
    """
    # Inputs are stored as given; no copy is made.
    self.array: npt.NDArray = array
    self.index_patients = index_patients
    self.timescale = timescale
    self.base = evt_log

    # NOTE(review): this writes into the caller's dictionary (and overwrites any
    # existing "other" entry) -- confirm that the in-place mutation is intended.
    self.dict_label_id = dict_label_id
    self.dict_label_id["other"] = 100

    # Nothing has been fitted yet.
    self.is_fitted = False
    self.list_ids_clusters: list[Any] = []
    self.sorted_array: list[Any] = []
Functions
fit()

Fit the TAK model (to be implemented by subclasses).

Source code in opentak/clustering.py
53
54
55
56
57
def fit(self) -> Tak:
    """Fit the TAK model.

    This implementation contains no fitting logic and always raises;
    subclasses are expected to provide their own ``fit``.

    :raises NotImplementedError: always
    """
    raise NotImplementedError
get_list_indices_cluster(list_ids_cluster=None)

Compute, for each cluster in list_ids_cluster, the positions in index_patients of the patients it contains

Parameters:

Name Type Description Default
list_ids_cluster list | None

list of clusters, each a list of patient IDs; if None, uses self.list_ids_clusters

None

Returns:

Type Description

list of arrays containing indices for each cluster

Source code in opentak/clustering.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def get_list_indices_cluster(self, list_ids_cluster: list | None = None):
    """Map each cluster of patient IDs to positions in ``self.index_patients``.

    :param list_ids_cluster: list of clusters, each an iterable of patient IDs;
        if None, ``self.list_ids_clusters`` is used
    :return: list with one integer array per cluster, holding the position in
        ``self.index_patients`` of every patient of that cluster
    :raises KeyError: if a patient ID is not present in ``self.index_patients``
    """
    if list_ids_cluster is None:
        list_ids_cluster = self.list_ids_clusters

    # Built once so that every ID -> position lookup below is O(1).
    position_of = {pat: idx for idx, pat in enumerate(self.index_patients)}

    # The explicit integer dtype keeps empty clusters usable as index arrays:
    # np.array([]) would default to float64, which NumPy rejects for indexing.
    return [
        np.array([position_of[pat] for pat in ids], dtype=np.intp)
        for ids in list_ids_cluster
    ]
get_sorted_array(list_ids_cluster=None)

Compute the sorted array corresponding to list ids clusters

Parameters:

Name Type Description Default
list_ids_cluster list | None

list of patient ids in the cluster format, if None uses self.list_ids_clusters

None

Returns:

Type Description

list of arrays containing sorted sequences for each cluster

Source code in opentak/clustering.py
76
77
78
79
80
81
82
83
84
85
86
def get_sorted_array(self, list_ids_cluster: list | None = None):
    """Gather the rows of ``self.array`` cluster by cluster.

    :param list_ids_cluster: list of patient ids in the cluster format; passed
        unchanged to ``get_list_indices_cluster`` (None included)
    :return: list with one array per cluster, made of the rows of ``self.array``
        selected by that cluster's indices, in the order of those indices
    """
    rows_per_cluster = []
    for positions in self.get_list_indices_cluster(list_ids_cluster):
        # Indexing with an index array picks the cluster's rows in that order.
        rows_per_cluster.append(self.array[positions])
    return rows_per_cluster

TakHca(array, index_patients, dict_label_id, timescale, evt_log)

Bases: Tak

Classic TAK using hierarchical clustering.

Initialize the TakHca class.

Parameters:

Name Type Description Default
array NDArray

1 row = 1 patient, 1 column = 1 timestamp

required
index_patients NDArray

patient IDs in the same order as the rows of array

required
dict_label_id dict

dictionary mapping the name of the event to its id

required
timescale int

time window size (in days); sequences may be resampled if != 1

required
evt_log DataFrame

initial event log used by TAK

required
Source code in opentak/clustering.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def __init__(
    self,
    array: npt.NDArray,
    index_patients: npt.NDArray,
    dict_label_id: dict,
    timescale: int,
    evt_log: pd.DataFrame,
):
    """Set up a TAK model based on hierarchical clustering.

    All arguments are forwarded unchanged to the parent initializer.

    :param array: 1 row = 1 patient, 1 column = 1 timestamp
    :param index_patients: patient IDs in the same order as the rows of ``array``
    :param dict_label_id: dictionary mapping event names to their IDs
    :param timescale: time window size (in days); resampling if ``!= 1``
    :param evt_log: initial event log used by TAK
    """
    super().__init__(array, index_patients, dict_label_id, timescale, evt_log)

    # Default clustering settings.
    self.distance: _Metric = "hamming"
    self.method: LinkageMethod = "ward"

    # Nothing has been computed yet.
    self.pdist: npt.NDArray | None = None
    self.pdist_uncondensed: npt.NDArray | None = None
    self.linkage: npt.NDArray | None = None
    self.linkage_total: npt.NDArray | None = None
Functions
compute_pdist(distance='hamming', subset_array=None)

Compute pairwise distance between patients' sequences.

Parameters:

Name Type Description Default
distance _Metric

Metric used to compute the pairwise distances. Defaults to "hamming".

'hamming'
subset_array NDArray | None

subset of patients. If not provided the pairwise distance will be computed for all patients.

None

Returns:

Type Description
Tak

instance

Source code in opentak/clustering.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def compute_pdist(
    self,
    distance: _Metric = "hamming",
    subset_array: npt.NDArray | None = None,
) -> Tak:
    """Compute and store the pairwise distances between patients' sequences.

    The condensed distances are stored in ``self.pdist`` and their square
    form in ``self.pdist_uncondensed``; ``self.distance`` records the metric.

    :param distance: metric passed to ``scipy.spatial.distance.pdist``
    :param subset_array: sequences to compare instead of ``self.array``;
        if None, ``self.array`` is used
    :return: the instance itself
    """
    sequences = self.array if subset_array is None else subset_array

    self.distance = distance

    condensed = spatial.distance.pdist(sequences, metric=distance)
    self.pdist = condensed
    self.pdist_uncondensed = spatial.distance.squareform(condensed)
    return self
get_clusters(n_clusters=1, method='ward', patient_ids=None, optimal_ordering=True)

Clusters patients' sequences.

Parameters:

Name Type Description Default
n_clusters int

number of clusters to create

1
method LinkageMethod

linkage method ("ward", "single", "complete", "average")

'ward'
patient_ids Sequence | None

positions of the patients to cluster (used to index the pairwise distance matrix); if None, all patients are used

None
optimal_ordering bool

reorder tree leaves (longer computation time)

True

Returns:

Type Description
tuple[ndarray, ndarray]

tuple of (cluster number for each patient, list of patient indices in optimal order)

Source code in opentak/clustering.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def get_clusters(
    self,
    n_clusters: int = 1,
    method: LinkageMethod = "ward",
    patient_ids: Sequence | None = None,
    optimal_ordering: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """Cluster patients' sequences with hierarchical clustering.

    :param n_clusters: number of clusters to create
    :param method: linkage method ("ward", "single", "complete", "average")
    :param patient_ids: positions of the patients to cluster, used as row and
        column indices of ``self.pdist_uncondensed`` (any integer sequence:
        list, tuple, array); if None, all patients are used
    :param optimal_ordering: reorder tree leaves (longer computation time)
    :return: tuple of (cluster number for each patient, leaf order of the
        dendrogram); when ``patient_ids`` is given, both refer to positions
        within ``patient_ids``
    :raises ValueError: if fewer patients than clusters are provided, or if
        ``patient_ids`` is given before the pairwise distances were computed
    """
    if patient_ids is None:
        # get computed distance matrix if already done
        pdist = self._check_pdist(pdist=None)
    else:
        n_selected = len(patient_ids)
        if n_selected < n_clusters:
            raise ValueError(
                f"Cannot compute more clusters than patients {n_selected} "
                f"were provided for {n_clusters} clusters"
            )
        if n_selected == 1:
            # A single patient is trivially its own (only) cluster.
            return np.array([0]), np.array([0])

        # if a list of patient is provided, the pdist is already computed
        if self.pdist_uncondensed is None:
            raise ValueError(
                "self.pdist_uncondensed is None, call self.compute_pdist() first"
            )
        # Convert first: indexing directly with a tuple would be read by NumPy
        # as one index per dimension instead of a selection of rows.
        selection = np.asarray(patient_ids)
        pdist_uncondensed_ids = self.pdist_uncondensed[np.ix_(selection, selection)]
        pdist = spatial.distance.squareform(pdist_uncondensed_ids)

    linkage = self._get_linkage(
        pdist=pdist, method=method, optimal_ordering=optimal_ordering
    )

    # Keep the linkage only when it was built from as many patients as the
    # model holds.
    if patient_ids is None or len(patient_ids) == len(self.index_patients):
        self.linkage_total = linkage

    list_indices_ordered = cluster.hierarchy.leaves_list(linkage)

    # cut_tree returns one column per requested cut; a single cut is requested.
    patients_groups_id = cluster.hierarchy.cut_tree(
        linkage, n_clusters=np.array([n_clusters])
    )[:, 0]

    return patients_groups_id, list_indices_ordered
fit(n_clusters=1, method='ward', distance='hamming', optimal_ordering=True)

Cluster patients' sequences.

Shorthand for: 1. Computing pairwise distances 2. Building the linkage matrix 3. Ordering patients by dendrogram leaves

Parameters:

Name Type Description Default
n_clusters int

number of clusters to create

1
method LinkageMethod

linkage method ("ward", "single", "complete", "average")

'ward'
distance _Metric

pairwise distance method

'hamming'
optimal_ordering bool

whether to reorder tree leaves (optimal ordering)

True

Returns:

Type Description
Tak

the fitted TAK instance

Source code in opentak/clustering.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def fit(
    self,
    n_clusters: int = 1,
    method: LinkageMethod = "ward",
    distance: _Metric = "hamming",
    optimal_ordering: bool = True,
) -> Tak:
    """Cluster patients' sequences and store the result on the instance.

    Steps:
    1. Compute pairwise distances, unless stored ones can be reused
    2. Cluster the patients with ``get_clusters``
    3. Group patient IDs by cluster, following the dendrogram leaf order

    :param n_clusters: number of clusters to create
    :param method: linkage method ("ward", "single", "complete", "average")
    :param distance: pairwise distance method
    :param optimal_ordering: whether to reorder tree leaves (optimal ordering)
    :return: the fitted instance (``self``)
    """
    # Stored distances are reused only when they exist and both the requested
    # distance and linkage method equal the values stored on the instance.
    pdist_is_current = (
        self.pdist is not None
        and distance == self.distance
        and method == self.method
    )
    if not pdist_is_current:
        self.compute_pdist(distance)

    labels, leaf_order = self.get_clusters(
        n_clusters=n_clusters, method=method, optimal_ordering=optimal_ordering
    )

    # Walk the patients in leaf order and bucket their IDs by cluster label.
    # A dict keeps insertion order, so clusters come out in the order in which
    # they are first met along the leaves.
    ids_by_label: dict = {}
    for position, label in zip(leaf_order.tolist(), labels[leaf_order].tolist()):
        ids_by_label.setdefault(label, []).append(self.index_patients[position])

    self.list_ids_clusters = list(ids_by_label.values())
    self.sorted_array = self.get_sorted_array()
    self.is_fitted = True
    return self