Source code for subcell_pipeline.simulation.post_processing
"""Methods for processing simulations."""importnumpyasnpimportpandasaspdfromio_collection.keys.check_keyimportcheck_keyfromio_collection.load.load_dataframeimportload_dataframefromio_collection.save.save_dataframeimportsave_dataframeSAMPLE_COLUMNS:list[str]=["xpos","ypos","zpos"]"""Columns names used when sampling simulation data."""
def sample_simulation_data(
    bucket: str,
    series_name: str,
    condition_keys: list[str],
    random_seeds: list[int],
    n_timepoints: int,
    n_monomer_points: int,
) -> None:
    """
    Downsample stored simulation data for every condition and seed.

    For each condition key / random seed pair, the full dataframe is read from
    ``{series_name}/data/``, reduced with ``sample_simulation_data_points``,
    and written (without the index) to ``{series_name}/samples/`` under the
    same file name. Pairs whose sampled file already exists are skipped.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    condition_keys
        List of condition keys. An empty key means the series name is used
        on its own, without a condition suffix.
    random_seeds
        Random seeds for simulations.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    """

    for condition_key in condition_keys:
        # Only append the condition suffix when there is a condition key.
        if condition_key:
            series_key = f"{series_name}_{condition_key}"
        else:
            series_key = series_name

        for seed in random_seeds:
            source_key = f"{series_name}/data/{series_key}_{seed:06d}.csv"
            sampled_key = f"{series_name}/samples/{series_key}_{seed:06d}.csv"

            if not check_key(bucket, sampled_key):
                print(f"Sampling data for [ {condition_key} ] seed [ {seed} ]")

                sampled = sample_simulation_data_points(
                    load_dataframe(bucket, source_key),
                    n_timepoints,
                    n_monomer_points,
                )
                save_dataframe(bucket, sampled_key, sampled, index=False)
            else:
                # Output is already present, so there is nothing to redo.
                print(f"Sampled dataframe [ {sampled_key} ] already exists. Skipping.")
def sample_simulation_data_points(
    data: pd.DataFrame,
    n_timepoints: int,
    n_monomer_points: int,
    sampled_columns: list[str] = SAMPLE_COLUMNS,
) -> pd.DataFrame:
    """
    Downsample simulation data in time and along each fiber.

    ``n_timepoints + 1`` evenly spaced positions are mapped onto the array of
    unique values in the ``time`` column and rounded to the nearest index;
    only rows at the selected times are kept. Positions that round to the same
    index select the same time, so fewer distinct timepoints may be returned.

    For each selected time, every sampled column is linearly interpolated from
    that time's rows (treated as evenly spaced, in their existing order) onto
    ``n_monomer_points`` evenly spaced positions.

    Parameters
    ----------
    data
        Full simulation data. Must contain a ``time`` column and every column
        named in ``sampled_columns``.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    sampled_columns
        List of column names to sample.

    Returns
    -------
    :
        Sampled simulation data with columns ``fiber_point``, ``time`` and the
        sampled columns. The per-timepoint frames are concatenated without
        resetting the index.
    """

    timepoints = data["time"].unique()
    n_unique = timepoints.size

    # Spread n_timepoints + 1 fractions in [0, 1] over the index range of the
    # unique timepoints, then snap each one to the nearest integer index.
    fractions = np.linspace(0, 1, n_timepoints + 1)
    positions = np.interp(
        fractions,
        np.linspace(0, 1, n_unique),
        np.arange(n_unique),
    )
    selected_times = timepoints[np.rint(positions).astype(int)]
    selected_rows = data[data["time"].isin(selected_times)]

    # Target positions along the fiber are the same for every timepoint.
    target_fractions = np.linspace(0, 1, n_monomer_points)

    def resample(timepoint, rows: pd.DataFrame) -> pd.DataFrame:
        """Interpolate one timepoint's rows onto the target fiber positions."""
        frame = pd.DataFrame()
        frame["fiber_point"] = np.arange(n_monomer_points)
        frame["time"] = timepoint

        source_fractions = np.linspace(0, 1, rows.shape[0])
        for column in sampled_columns:
            frame[column] = np.interp(
                target_fractions,
                source_fractions,
                rows[column].values,
            )

        return frame

    return pd.concat(
        [
            resample(timepoint, rows)
            for timepoint, rows in selected_rows.groupby("time")
        ]
    )