Skip to content

Generation cohort tak

Classes

GenerateCohortTAK(nb_patients=500, nb_days_end=365, random_state=None)

Initialize the number of patients and the maximum length of follow-up.

Parameters:

Name Type Description Default
nb_patients

number of patients in the cohort

500
nb_days_end

maximum number of days for which deliveries can be made

365
Source code in opentak/generation_cohort_tak.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
def __init__(
    self, nb_patients=500, nb_days_end=365, random_state: int | None = None
):
    """Initialize the number of patients and the maximum length of follow-up.

    :param nb_patients: number of patients in the cohort
    :param nb_days_end: maximum number of days for which deliveries can be made
    """
    self.nb_patients = nb_patients
    self.nb_days_end = nb_days_end
    self.random_state = random_state
    self.rng = np.random.default_rng(self.random_state)
Functions
initialisation_dataframe(treatment_name='A', dose_mean=30, dose_std=10)

Create an initial database with the correct columns and populate it with treatment_name entries.

Parameters:

Name Type Description Default
treatment_name

name of the treatment

'A'
dose_mean

theoretical dosage, used as average delivery interval

30
dose_std

standard deviation of the interval between doses, represents the difference found in practice between the dosage and the interval between doses.

10
Source code in opentak/generation_cohort_tak.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def initialisation_dataframe(self, treatment_name="A", dose_mean=30, dose_std=10):
    """Create an initial database with the correct columns and populate it with treatment_name entries.

    :param treatment_name: name of the treatment
    :param dose_mean: theoretical dosage, used as  average delivery interval
    :param dose_std: standard deviation of the interval between doses, represents the difference found in practice
    between the dosage and the interval between doses.
    """
    # Compute max deliveries
    nb_deliveries_max = int(self.nb_days_end / (dose_mean - dose_std / 2))

    # Compute number of days column based on dosage
    dose_real = self.rng.normal(
        dose_mean, dose_std, size=(self.nb_patients, nb_deliveries_max)
    ).astype(int)
    dose_real = np.where(dose_real <= 0, 1, dose_real)
    nbdays_not_flatten = np.cumsum(dose_real, axis=1)
    nbdays = nbdays_not_flatten.flatten()

    base = pd.DataFrame([])
    base["ID_PATIENT"] = np.repeat(range(self.nb_patients), nb_deliveries_max)
    base["TIMESTAMP"] = nbdays
    base["EVT"] = treatment_name
    base["DOSAGE"] = 30

    # remove nbdays later than nb_jours_max
    self.base = base[base["TIMESTAMP"].le(self.nb_days_end)]
    self.nb_rows_per_patient = (
        self.base.groupby("ID_PATIENT").count()["TIMESTAMP"].to_numpy()
    )

    return self.base
add_switch_linear(treatment_name, start_period_switch=None, end_period_switch=None, proportion_of_cohort=1)

Add a switch to the treatment_name medication, with a linear distribution from start_period_switch to end_period_switch for the nbdays in which it appears.

Parameters:

Name Type Description Default
treatment_name

name of the treatment

required
start_period_switch

smallest nbdays of switch

None
end_period_switch

largest nbdays os switch

None
proportion_of_cohort

proportion of the cohort affected by the switch

1
Source code in opentak/generation_cohort_tak.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def add_switch_linear(
    self,
    treatment_name,
    start_period_switch=None,
    end_period_switch=None,
    proportion_of_cohort=1,
):
    # ruff: noqa: D205
    """Add a switch to the treatment_name medication, with a linear distribution from start_period_switch to
       end_period_switch for the nbdays in which it appears.

    :param treatment_name: name of the treatment
    :param start_period_switch: smallest nbdays of switch
    :param end_period_switch: largest nbdays os switch
    :param proportion_of_cohort: proportion of the cohort affected by the switch
    """
    # Calculation of a start_period_switch and an end_period_switch if they are not provided by the user
    if start_period_switch is None:
        start_period_switch = int(self.nb_days_end / 3)
    if end_period_switch is None:
        end_period_switch = int(2 * self.nb_days_end / 3)

    # calculation of the distribution of change days to treatment_name
    distrib_nbdays_switch = self.rng.integers(
        start_period_switch, end_period_switch, size=self.nb_patients
    )

    # Applies the switch to proportion_of_cohort and adds the lines to the dataframe.
    return self._add_switch(
        treatment_name, distrib_nbdays_switch, proportion_of_cohort
    )
add_switch_gaussien(treatment_name, mean=None, std=None, proportion_of_cohort=1)

Add a switch to the treatment_name medication, with a Gaussian distribution for the nbdays at which it appears.

Parameters:

Name Type Description Default
treatment_name

name of the treatment

required
mean

mean of the Gaussian distribution of switch nbdays

None
std

standard deviation of the Gaussian distribution of switch nbdays

None
proportion_of_cohort

proportion of the cohort affected by the switch

1
Source code in opentak/generation_cohort_tak.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def add_switch_gaussien(
    self, treatment_name, mean=None, std=None, proportion_of_cohort=1
):
    # ruff: noqa: D205
    """Add a switch to the treatment_name medication, with a Gaussian distribution for the nbdays
    at which it appears.

    :param treatment_name: name of the treatment
    :param mean: mean of the Gaussian distribution of switch nbdays
    :param std: standard deviation of the Gaussian distribution of switch nbdays
    :param proportion_of_cohort: proportion of the cohort affected by the switch
    """
    # Calculating a mean and standard deviation if they are not provided by the user
    if mean is None:
        mean = int(self.nb_days_end / 2)
    if std is None:
        std = int(self.nb_days_end / 8)

    # Calculation of the distribution of change days to treatment_name
    distrib_nbdays_switch = self.rng.normal(mean, std, size=self.nb_patients)

    # Applies the switch to proportion_of_cohort and adds the lines to the dataframe.
    return self._add_switch(
        treatment_name, distrib_nbdays_switch, proportion_of_cohort
    )
add_drug_holidays(start_dh_min=None, start_dh_max=None, duration_dh_min=None, duration_dh_max=None, proportion_of_cohort=1)

Remove deliveries in order to show drug holidays.

Parameters:

Name Type Description Default
start_dh_min int | None

minimum number of days from the start of the drug holiday period

None
start_dh_max int | None

maximum number of days from the start of the drug holiday period

None
duration_dh_min int | None

minimum duration of the drug holiday period

None
duration_dh_max int | None

maximum duration of the drug holiday period

None
proportion_of_cohort

proportion of the cohort affected by the drug holidays

1
Source code in opentak/generation_cohort_tak.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def add_drug_holidays(
    self,
    start_dh_min: int | None = None,
    start_dh_max: int | None = None,
    duration_dh_min: int | None = None,
    duration_dh_max: int | None = None,
    proportion_of_cohort=1,
):
    """Remove deliveries in order to show drug holidays.

    :param start_dh_min: minimum number of days from the start of the drug holiday period
    :param start_dh_max: maximum number of days from the start of the drug holiday period
    :param duration_dh_min: minimum duration of the drug holiday period
    :param duration_dh_max: maximum duration of the drug holiday period
    :param proportion_of_cohort: proportion of the cohort affected by the drug holidays
    """
    # Calculation of start_dh_min and start_dh_max if not provided by the user
    if start_dh_min is None:
        start_dh_min = int(self.nb_days_end / 3)
    if start_dh_max is None:
        start_dh_max = int(2 * self.nb_days_end / 3)

    # Calculation of a start_period_switch and an end_period_switch if they are not provided by the user
    if duration_dh_min is None:
        duration_dh_min = int(self.nb_days_end / 6)
    if duration_dh_max is None:
        duration_dh_max = int(1.2 * self.nb_days_end / 6)

    # Calculation of the distribution of treatment interruption days
    distrib_nbdays_start_dh = self.rng.integers(
        start_dh_min, start_dh_max, size=self.nb_patients
    )
    duration_dh = self.rng.integers(
        duration_dh_min, duration_dh_max, size=self.nb_patients
    )
    distrib_nbdays_end_dh = distrib_nbdays_start_dh + duration_dh

    # check that proportion_of_cohort belongs to the segment [0,1]
    if proportion_of_cohort < 0 or proportion_of_cohort > 1:
        raise AttributeError("proportion_of_cohort should be between 0 and 1")

    # Random selection of patients who will not receive this switch
    index_droped = self.rng.choice(
        self.nb_patients,
        int((1 - proportion_of_cohort) * self.nb_patients),
        replace=False,
    )
    distrib_nbdays_end_dh[index_droped] = distrib_nbdays_start_dh[index_droped]

    # Add the treatment_name lines corresponding to the switch in the database.
    self.base["nbdays_start_dh"] = np.repeat(
        distrib_nbdays_start_dh, self.nb_rows_per_patient
    )
    self.base["nbdays_end_dh"] = np.repeat(
        distrib_nbdays_end_dh, self.nb_rows_per_patient
    )
    self.base = self.base[
        self.base["TIMESTAMP"].le(self.base["nbdays_start_dh"])
        | self.base["TIMESTAMP"].ge(self.base["nbdays_end_dh"])
    ]
    self.base = self.base.drop(["nbdays_start_dh", "nbdays_end_dh"], axis=1)

    self._update_nb_rows_per_patient()

    return self.base
drop_missing_deliveries(proba_suppression_delivery=0.05)

Randomly deletes entries from the database.

Parameters:

Name Type Description Default
proba_suppression_delivery

proportion of deliveries to be removed from the database

0.05
Source code in opentak/generation_cohort_tak.py
208
209
210
211
212
213
214
215
216
217
218
219
220
def drop_missing_deliveries(self, proba_suppression_delivery=0.05):
    """Randomly deletes entries from the database.

    :param proba_suppression_delivery: proportion of deliveries to be removed from the database
    """
    # Remove deliveries at random
    self.base = self.base.loc[
        self.rng.random(len(self.base)) > proba_suppression_delivery, :
    ]

    self._update_nb_rows_per_patient()

    return self.base
add_in_out(proba_death=0)

Add a “in” to nbdays = 0 at the start of each patient and an “out” or “death” to nb_days_end+1 for each patient.

Parameters:

Name Type Description Default
proba_death

proportion of deaths in the cohort

0
Source code in opentak/generation_cohort_tak.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def add_in_out(self, proba_death=0):
    """Add a “in” to nbdays = 0 at the start of each patient and an “out” or “death” to nb_days_end+1 for each
    patient.

    :param proba_death: proportion of deaths in the cohort
    """
    # add 'in'
    base_in = pd.DataFrame(list(range(self.nb_patients)), columns=["ID_PATIENT"])
    base_in["TIMESTAMP"] = 0
    base_in["EVT"] = "in"
    base_in["DOSE"] = 0

    # add out and death
    base_out = pd.DataFrame(list(range(self.nb_patients)), columns=["ID_PATIENT"])
    base_out["TIMESTAMP"] = self.rng.integers(
        self.nb_days_end + 1, size=len(base_out)
    )
    base_out["EVT"] = self.rng.choice(
        ["out", "death"], self.nb_patients, p=[1 - proba_death, proba_death]
    )
    base_out.loc[base_out["EVT"].eq("out"), "TIMESTAMP"] = self.nb_days_end + 1
    base_out["DOSE"] = np.nan

    # remove element appearing after death or out
    self.base["end"] = np.repeat(
        base_out["TIMESTAMP"].values, self.nb_rows_per_patient
    )
    self.base = self.base[self.base["TIMESTAMP"].le(self.base["end"])]
    self.base = self.base.drop("end", axis=1)

    self.base = pd.concat([base_in, self.base, base_out]).sort_values(
        ["ID_PATIENT", "TIMESTAMP"], kind="mergesort"
    )

    self._update_nb_rows_per_patient()

    return self.base