allocate population equally across districts at initialisation (#1331)

tdm32 · tbhallett · web-flow · commit b844efeb1b94 · 2024-05-14T13:47:54.000+01:00
* fix failing test

* fix unused import statement

* edit optional dependency in demography.py

* roll back simulation.py

* put kwarg in demography.py

* update test

* roll back incidental change

* factorize calc

* add is_alive

* roll back incidental changes

* make static for clarity

* roll back incidental changes

---------

Co-authored-by: Tim Hallett <39991060+tbhallett@users.noreply.github.com>
diff --git a/src/tlo/methods/demography.py b/src/tlo/methods/demography.py
@@ -73,9 +73,10 @@ class Demography(Module):
     The core demography module.
     """
 
-    def __init__(self, name=None, resourcefilepath=None):
+    def __init__(self, name=None, resourcefilepath=None, equal_allocation_by_district: bool = False):
         super().__init__(name)
         self.resourcefilepath = resourcefilepath
+        self.equal_allocation_by_district = equal_allocation_by_district
         self.initial_model_to_data_popsize_ratio = None  # will store scaling factor
         self.popsize_by_year = dict()  # will store total population size each year
         self.causes_of_death = dict()  # will store all the causes of death that are possible in the simulation
@@ -245,6 +246,8 @@ def initialise_population(self, population):
             init_pop,
             max_age=self.parameters['max_age_initial']
         )
+        if self.equal_allocation_by_district:
+            init_pop = self._edit_init_pop_so_that_equal_number_in_each_district(init_pop)
 
         # randomly pick from the init_pop sheet, to allocate characteristic to each person in the df
         demog_char_to_assign = init_pop.iloc[self.rng.choice(init_pop.index.values,
@@ -381,6 +384,56 @@ def _edit_init_pop_to_prevent_persons_greater_than_max_age(self, df, max_age: in
         _df.prob = _df.prob / _df.prob.sum()  # Rescale `prob` so that it sums to 1.0
         return _df.reset_index(drop=True)
 
+    @staticmethod
+    def _edit_init_pop_so_that_equal_number_in_each_district(df) -> pd.DataFrame:
+        """Return a copy of `df` in which `Count` and `prob` are rescaled so every district has the same total.
+
+        :param df: `pd.DataFrame` with columns `District_Num`, `Age`, `Sex`, `Count` and `prob`.
+        :return: New `pd.DataFrame` of the same shape. The overall `Count` total and the age/sex breakdown within
+         each district are preserved; `prob` is recomputed from the new `Count` values. `df` is not modified.
+        :raises AssertionError: If the rescaled table fails any of the internal consistency checks."""
+
+        district_nums = df['District_Num'].unique()
+
+        # Target size of each district: the overall total shared equally between the districts
+        target_size_for_district = df['Count'].sum() / len(district_nums)
+
+        # Work on a copy; make `Count` float first so fractional values can be assigned without a dtype clash
+        df_new = df.copy()
+        df_new['Count'] = df_new['Count'].astype(float)
+
+        for district_num in district_nums:
+            mask_for_district = df['District_Num'] == district_num
+            # Scale the district's age/sex shares (which sum to 1.0) up to the common target size
+            df_new.loc[mask_for_district, 'Count'] = target_size_for_district * (
+                df.loc[mask_for_district, 'Count'] / df.loc[mask_for_district, 'Count'].sum()
+            )
+
+        # Recompute "prob" column (i.e. the probability of being in that category)
+        df_new["prob"] = df_new['Count'] / df_new['Count'].sum()
+
+        # Consistency checks. Totals are compared with a tolerance because the rescaling is floating-point
+        # arithmetic, for which exact equality is not guaranteed.
+        def all_elements_identical(x):
+            return np.allclose(x, x[0])
+
+        assert np.isclose(df['Count'].sum(), df_new['Count'].sum())
+        assert np.isclose(df['prob'].sum(), 1.0) and np.isclose(df_new['prob'].sum(), 1.0)
+        assert all_elements_identical(df_new.groupby('District_Num')['prob'].sum().values)
+
+        def get_age_sex_breakdown_in_district(dat, district_num):
+            # Mask built from `dat` itself, so the helper does not rely on `dat` sharing the index of `df`
+            in_district = dat['District_Num'] == district_num
+            return dat.loc[in_district].groupby(['Age', 'Sex'])['prob'].sum() / dat.loc[in_district, 'prob'].sum()
+
+        for _d in district_nums:
+            pd.testing.assert_series_equal(
+                get_age_sex_breakdown_in_district(df, _d),
+                get_age_sex_breakdown_in_district(df_new, _d)
+            )
+
+        return df_new
+
     def process_causes_of_death(self):
         """
         1) Register all causes of deaths defined by Module
diff --git a/tests/test_demography.py b/tests/test_demography.py
@@ -374,3 +374,31 @@ def test_ageing_of_old_people_up_to_max_age(simulation):
     # All persons should have died, with a cause of 'Other'
     assert not df.loc[ever_alive].is_alive.any()
     assert (df.loc[ever_alive, 'cause_of_death'] == 'Other').all()
+
+
+def test_equal_allocation_by_district(seed):
+    """Check that `equal_allocation_by_district=True` gives each district (close to) the same population size."""
+
+    resourcefilepath = Path(os.path.dirname(__file__)) / '../resources'
+    sim = Simulation(start_date=start_date, seed=seed)
+    sim.register(
+        demography.Demography(
+            resourcefilepath=resourcefilepath,
+            equal_allocation_by_district=True,
+        )
+    )
+    population_per_district = 10_000
+    number_of_districts = len(sim.modules['Demography'].districts)
+    popsize = number_of_districts * population_per_district
+    sim.make_initial_population(n=popsize)
+    sim.simulate(end_date=sim.start_date)  # Simulate for zero days
+
+    # check population size
+    df = sim.population.props
+    assert df.is_alive.sum() == popsize
+
+    # check the total within each district is close to the target population of each district. The reference value
+    # must be the target itself: comparing the observed sizes with themselves would pass for any allocation. Persons
+    # are allocated by random draw, hence the 5% relative tolerance.
+    pop_size_by_district = df.loc[df.is_alive].groupby('district_of_residence').size()
+    assert np.allclose(pop_size_by_district.values, population_per_district, rtol=0.05)