Clustering

Classes¶

`Tak(array, index_patients, dict_label_id, timescale, evt_log)` ¶

Defines the TAK object.

Initialize the Tak class.

Parameters:

Name	Type	Description	Default
`array`	`NDArray`	1 row = 1 patient, 1 column = 1 timestamp	required
`index_patients`	`NDArray`	patient IDs in the same order as in `array`	required
`dict_label_id`	`dict`	dictionary mapping event names to their IDs	required
`timescale`	`int`	time window size (in days); sequences may be resampled if `!= 1`	required
`evt_log`	`DataFrame`	initial event log used by TAK	required

Source code in opentak/clustering.py

def __init__(
    self,
    array: npt.NDArray,
    index_patients: npt.NDArray,
    dict_label_id: dict,
    timescale: int,
    evt_log: pd.DataFrame,
):
    """Store the inputs of a TAK model and reset its clustering state.

    :param array: 1 row = 1 patient, 1 column = 1 timestamp
    :param index_patients: patient IDs in the same order as in ``array``
    :param dict_label_id: dictionary mapping event names to their IDs; it is
        kept by reference and an ``"other"`` entry (ID 100) is written into it
    :param timescale: time window size (in days); sequences may be resampled if ``!= 1``
    :param evt_log: initial event log used by TAK (stored as ``self.base``)
    """
    # Input data, kept as provided.
    self.array: npt.NDArray = array
    self.index_patients = index_patients
    self.timescale = timescale
    self.base = evt_log

    # NOTE: the caller's dictionary is not copied, so the "other" entry
    # added here is also visible to whoever passed it in.
    self.dict_label_id = dict_label_id
    dict_label_id["other"] = 100

    # Clustering results: empty until a subclass ``fit`` populates them.
    self.is_fitted = False
    self.list_ids_clusters: list[Any] = []
    self.sorted_array: list[Any] = []

Functions¶

`fit()` ¶

Fit the TAK model (to be implemented by subclasses).

Source code in opentak/clustering.py

def fit(
    self,
) -> Tak:
    """Abstract fitting hook: concrete TAK subclasses must override it.

    :raises NotImplementedError: always, when called on the base class
    """
    raise NotImplementedError

`get_list_indices_cluster(list_ids_cluster=None)` ¶

Compute, for each cluster, the positions in `index_patients` of the patients listed in `list_ids_cluster`

Parameters:

Name	Type	Description	Default
`list_ids_cluster`	`list \| None`	list of clusters, each a list of patient IDs; if None, uses self.list_ids_clusters	`None`

Returns:

Type	Description
	list of arrays containing indices for each cluster

Source code in opentak/clustering.py

def get_list_indices_cluster(self, list_ids_cluster: list | None = None):
    """Compute indices in list_ids_cluster corresponding to patients in list_ids_cluster

    :param list_ids_cluster: list of patient ids in the cluster format
    :return: list of arrays containing indices for each cluster
    """
    index_map = {pat: idx for idx, pat in enumerate(self.index_patients)}

    list_ids_cluster = (
        list_ids_cluster if list_ids_cluster is not None else self.list_ids_clusters
    )
    list_indices_cluster = [
        np.array([index_map[pat] for pat in cluster])
        for cluster in list_ids_cluster
    ]
    return list_indices_cluster

`get_sorted_array(list_ids_cluster=None)` ¶

Compute the sorted array corresponding to list ids clusters

Parameters:

Name	Type	Description	Default
`list_ids_cluster`	`list \| None`	list of patient ids in the cluster format, if None uses self.list_ids_clusters	`None`

Returns:

Type	Description
	list of arrays containing sorted sequences for each cluster

Source code in opentak/clustering.py

def get_sorted_array(self, list_ids_cluster: list | None = None):
    """Compute the sorted array corresponding to list ids clusters

    :param list_ids_cluster: list of patient ids in the cluster format, if None uses self.list_ids_clusters
    :return: list of arrays containing sorted sequences for each cluster
    """
    list_indices_cluster = self.get_list_indices_cluster(list_ids_cluster)
    sorted_array = [
        self.array[indices_cluster] for indices_cluster in list_indices_cluster
    ]
    return sorted_array

`TakHca(array, index_patients, dict_label_id, timescale, evt_log)` ¶

Bases: Tak

Classic TAK using hierarchical clustering.

Initialize the TakHca class.

Parameters:

Name	Type	Description	Default
`array`	`NDArray`	1 row = 1 patient, 1 column = 1 timestamp	required
`index_patients`	`NDArray`	patient IDs in the same order as in `array`	required
`dict_label_id`	`dict`	dictionary mapping event names to their IDs	required
`timescale`	`int`	time window size (in days); sequences may be resampled if `!= 1`	required
`evt_log`	`DataFrame`	initial event log used by TAK	required

Source code in opentak/clustering.py

def __init__(
    self,
    array: npt.NDArray,
    index_patients: npt.NDArray,
    dict_label_id: dict,
    timescale: int,
    evt_log: pd.DataFrame,
):
    """Create a hierarchical-clustering TAK with nothing computed yet.

    :param array: 1 row = 1 patient, 1 column = 1 timestamp
    :param index_patients: patient IDs in the same order as in ``array``
    :param dict_label_id: dictionary mapping event names to their IDs
    :param timescale: time window size (in days); sequences may be resampled if ``!= 1``
    :param evt_log: initial event log used by TAK
    """
    super().__init__(array, index_patients, dict_label_id, timescale, evt_log)

    # Default clustering configuration.
    self.distance: _Metric = "hamming"
    self.method: LinkageMethod = "ward"

    # Pairwise distances: condensed vector and its square-matrix form,
    # both None until they are computed.
    self.pdist: npt.NDArray | None = None
    self.pdist_uncondensed: npt.NDArray | None = None

    # Linkage matrices, None until clustering has been run.
    self.linkage: npt.NDArray | None = None
    self.linkage_total: npt.NDArray | None = None

Functions¶

`compute_pdist(distance='hamming', subset_array=None)` ¶

Compute pairwise distance between patients' sequences.

Parameters:

Name	Type	Description	Default
`distance`	`_Metric`	Computation method for pairwise distance. Defaults to "hamming"	`'hamming'`
`subset_array`	`NDArray \| None`	subset of patients. If not provided the pairwise distance will be computed for all patients.	`None`

Returns:

Type	Description
`Tak`	instance

Source code in opentak/clustering.py

def compute_pdist(
    self,
    distance: _Metric = "hamming",
    subset_array: npt.NDArray | None = None,
) -> Tak:
    """Compute and store the pairwise distances between patients' sequences.

    The metric is recorded in ``self.distance``; the condensed distances go to
    ``self.pdist`` and their square-matrix form to ``self.pdist_uncondensed``.

    :param distance: metric used for the pairwise distance, "hamming" by default
    :param subset_array: sequences to compare instead of ``self.array``; when
        omitted, distances are computed for all patients
    :return: the instance itself
    """
    sequences = self.array if subset_array is None else subset_array

    self.distance = distance

    condensed = spatial.distance.pdist(sequences, metric=distance)
    self.pdist = condensed
    self.pdist_uncondensed = spatial.distance.squareform(condensed)
    return self

`get_clusters(n_clusters=1, method='ward', patient_ids=None, optimal_ordering=True)` ¶

Clusters patients' sequences.

Parameters:

Name	Type	Description	Default
`n_clusters`	`int`	number of clusters to create	`1`
`method`	`LinkageMethod`	linkage method ("ward", "single", "complete", "average")	`'ward'`
`patient_ids`	`Sequence \| None`	positional indices (rows of the precomputed distance matrix) of the patients to cluster; if None, all patients are used	`None`
`optimal_ordering`	`bool`	reorder tree leaves (longer computation time)	`True`

Returns:

Type	Description
`tuple[ndarray, ndarray]`	tuple of (cluster number for each patient, list of patient indices in optimal order)

Source code in opentak/clustering.py

def get_clusters(
    self,
    n_clusters: int = 1,
    method: LinkageMethod = "ward",
    patient_ids: Sequence | None = None,
    optimal_ordering: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """Run hierarchical clustering on all patients or on a subset of them.

    :param n_clusters: number of clusters to create
    :param method: linkage method ("ward", "single", "complete", "average")
    :param patient_ids: patients to cluster, given as positions used to index the
        rows and columns of ``self.pdist_uncondensed``; if None, all patients are used
    :param optimal_ordering: reorder tree leaves (longer computation time)
    :return: tuple of (cluster number for each patient, patient indices in
        dendrogram leaf order)
    :raises ValueError: if fewer patients than clusters are provided, or if a
        subset is requested before the square distance matrix was computed
    """
    use_all_patients = patient_ids is None

    if use_all_patients:
        # Reuse the distances already computed for the full population, if any.
        condensed = self._check_pdist(pdist=None)
    else:
        if n_clusters > len(patient_ids):
            raise ValueError(
                f"Cannot compute more clusters than patients {len(patient_ids)} "
                f"were provided for {n_clusters} clusters"
            )
        if len(patient_ids) == 1:
            # A single patient forms its own cluster; no tree is needed.
            return np.array([0]), np.array([0])

        # A subset relies on the square distance matrix being available.
        square = self.pdist_uncondensed
        if square is None:
            raise ValueError(
                "self.pdist_uncondensed is None, call self.compute_pdist() first"
            )
        # Keep only the selected rows and columns, then go back to condensed form.
        condensed = spatial.distance.squareform(square[patient_ids][:, patient_ids])

    tree = self._get_linkage(
        pdist=condensed, method=method, optimal_ordering=optimal_ordering
    )

    # Remember the tree only when it spans the whole population.
    if use_all_patients or len(patient_ids) == len(self.index_patients):
        self.linkage_total = tree

    leaf_order = cluster.hierarchy.leaves_list(tree)
    labels = cluster.hierarchy.cut_tree(tree, n_clusters=np.array([n_clusters]))[:, 0]

    return labels, leaf_order

`fit(n_clusters=1, method='ward', distance='hamming', optimal_ordering=True)` ¶

Cluster patients' sequences.

Shorthand for: 1. Computing pairwise distances 2. Building the linkage matrix 3. Ordering patients by dendrogram leaves

Parameters:

Name	Type	Description	Default
`n_clusters`	`int`	number of clusters to create	`1`
`method`	`LinkageMethod`	linkage method ("ward", "single", "complete", "average")	`'ward'`
`distance`	`_Metric`	pairwise distance method	`'hamming'`
`optimal_ordering`	`bool`	whether to reorder tree leaves (optimal ordering)	`True`

Returns:

Type	Description
`Tak`	TAK fitted

Source code in opentak/clustering.py

def fit(
    self,
    n_clusters: int = 1,
    method: LinkageMethod = "ward",
    distance: _Metric = "hamming",
    optimal_ordering: bool = True,
) -> Tak:
    """Cluster patients' sequences.

    Shorthand for:
    1. Computing pairwise distances (skipped when already available for ``distance``)
    2. Building the linkage matrix
    3. Ordering patients by dendrogram leaves

    On return, ``self.list_ids_clusters`` holds the patient IDs of each cluster
    (clusters and patients both in dendrogram leaf order), ``self.sorted_array``
    the matching sequences, and ``self.is_fitted`` is True.

    :param n_clusters: number of clusters to create
    :param method: linkage method ("ward", "single", "complete", "average")
    :param distance: pairwise distance method
    :param optimal_ordering: whether to reorder tree leaves (optimal ordering)
    :return: TAK fitted
    """
    # Pairwise distances depend only on the metric, not on the linkage method,
    # so a change of ``method`` alone must not trigger a costly recomputation.
    is_pdist_obsolete = self.pdist is None or distance != self.distance

    if is_pdist_obsolete:
        self.compute_pdist(distance)

    patient_cluster_labels, list_indices = self.get_clusters(
        n_clusters=n_clusters, method=method, optimal_ordering=optimal_ordering
    )

    # Cluster label of every patient, read in dendrogram leaf order.
    cluster_labels_ordered = patient_cluster_labels[list_indices]
    # Distinct labels in order of first appearance along the leaves
    # (dict.fromkeys de-duplicates while preserving order).
    cluster_order_by_leaves = list(dict.fromkeys(cluster_labels_ordered.tolist()))
    # Patient positions of each cluster, still in leaf order.
    list_ids_ordered = [
        list_indices[cluster_labels_ordered == c].tolist()
        for c in cluster_order_by_leaves
    ]
    # Translate positions into patient IDs.
    list_ids_cluster_ordered = [
        [self.index_patients[i] for i in group] for group in list_ids_ordered
    ]

    self.list_ids_clusters = list_ids_cluster_ordered
    self.sorted_array = self.get_sorted_array()
    self.is_fitted = True
    return self

Clustering

Classes¶

Tak(array, index_patients, dict_label_id, timescale, evt_log) ¶

Functions¶

fit() ¶

get_list_indices_cluster(list_ids_cluster=None) ¶

get_sorted_array(list_ids_cluster=None) ¶

TakHca(array, index_patients, dict_label_id, timescale, evt_log) ¶

Functions¶

compute_pdist(distance='hamming', subset_array=None) ¶

get_clusters(n_clusters=1, method='ward', patient_ids=None, optimal_ordering=True) ¶

fit(n_clusters=1, method='ward', distance='hamming', optimal_ordering=True) ¶

`Tak(array, index_patients, dict_label_id, timescale, evt_log)` ¶

`fit()` ¶

`get_list_indices_cluster(list_ids_cluster=None)` ¶

`get_sorted_array(list_ids_cluster=None)` ¶

`TakHca(array, index_patients, dict_label_id, timescale, evt_log)` ¶

`compute_pdist(distance='hamming', subset_array=None)` ¶

`get_clusters(n_clusters=1, method='ward', patient_ids=None, optimal_ordering=True)` ¶

`fit(n_clusters=1, method='ward', distance='hamming', optimal_ordering=True)` ¶