1"""Methods for parsing ReaDDy simulations."""
3import os
4from math import floor, log10
5from typing import Optional, Union
7import boto3
8import numpy as np
9import pandas as pd
10from botocore.exceptions import ClientError
11from io_collection.keys.check_key import check_key
12from io_collection.save.save_dataframe import save_dataframe
14from subcell_pipeline.simulation.readdy.loader import ReaddyLoader
15from subcell_pipeline.simulation.readdy.post_processor import ReaddyPostProcessor

COLUMN_NAMES: list[str] = [
    "fiber_id",
    "xpos",
    "ypos",
    "zpos",
    "xforce",
    "yforce",
    "zforce",
    "segment_curvature",
    "time",
    "fiber_point",
]
"""Parsed tidy data column names."""

COLUMN_DTYPES: dict[str, Union[type[float], type[int]]] = {
    "fiber_id": int,
    "xpos": float,
    "ypos": float,
    "zpos": float,
    "xforce": float,
    "yforce": float,
    "zforce": float,
    "segment_curvature": float,
    "time": float,
    "fiber_point": int,
}
"""Parsed tidy data column data types."""

READDY_TIMESTEP: float = 0.1
"""Simulation timestep (in ns)."""

BOX_SIZE: np.ndarray = np.array(3 * [600.0])
"""Default simulation volume dimensions (x, y, z)."""

COMPRESSION_DISTANCE: float = 150.0
"""Total distance the fiber end was displaced (in nm)."""


def _download_s3_file(bucket: str, key: str, dest_path: str) -> Optional[str]:
    """
    Download file from S3 to local path.

    Parameters
    ----------
    bucket
        Name of S3 bucket.
    key
        Source key.
    dest_path
        Target local path.

    Returns
    -------
    Optional[str]
        Local file path if the file exists or was downloaded, None if the
        download failed.
    """

    if os.path.isfile(dest_path):
        return dest_path

    s3_client = boto3.client("s3")

    try:
        s3_client.download_file(bucket, key, dest_path)
        print(f"Downloaded [ {key} ] to [ {dest_path} ].")
        return dest_path
    except ClientError:
        print(f"!!! Failed to download {key}")
        return None


def download_readdy_hdf5(
    bucket: str,
    series_name: str,
    series_key: str,
    rep_ix: int,
    download_path: str,
) -> Optional[str]:
    """
    Download ReaDDy h5 file from S3 to local path.

    The ReaDDy Python package currently requires a local file path.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    series_key
        Combination of series and condition names.
    rep_ix
        Replicate index.
    download_path
        Path for downloading temporary h5 files.

    Returns
    -------
    Optional[str]
        Local path to the downloaded h5 file, None if the download failed.
    """

    if bucket.startswith("s3://"):
        bucket = bucket.replace("s3://", "")

    aws_h5_key = f"{series_name}/outputs/{series_key}_{rep_ix}.h5"
    local_h5_path = os.path.join(download_path, f"{series_key}_{rep_ix}.h5")
    return _download_s3_file(bucket, aws_h5_key, local_h5_path)
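

# A minimal usage sketch for download_readdy_hdf5 (the bucket and series
# names below are hypothetical placeholders, not values from this pipeline):
#
#     local_path = download_readdy_hdf5(
#         bucket="s3://my-simulation-bucket",
#         series_name="ACTIN_COMPRESSION",
#         series_key="ACTIN_COMPRESSION_0047",
#         rep_ix=0,
#         download_path="/tmp",
#     )
#     # Fetches ACTIN_COMPRESSION/outputs/ACTIN_COMPRESSION_0047_0.h5 to
#     # /tmp/ACTIN_COMPRESSION_0047_0.h5; returns None if the download fails.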


def parse_readdy_simulation_single_fiber_trajectory(
    bucket: str,
    series_name: str,
    series_key: str,
    rep_ix: int,
    n_timepoints: int,
    n_monomer_points: int,
    total_steps: int,
    temp_path: str,
    timestep: float = READDY_TIMESTEP,
) -> pd.DataFrame:
    """
    Parse ReaDDy trajectory data into tidy data format.

    Note that this method assumes there is only one fiber in the simulation.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    series_key
        Combination of series and condition names.
    rep_ix
        Replicate index.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    total_steps
        Total number of steps for each given simulation.
    temp_path
        Path for saving temporary h5 files.
    timestep
        Simulation timestep (in ns).

    Returns
    -------
    pd.DataFrame
        Parsed tidy data for the simulated fiber.
    """

    # ReaDDy requires a local file, so download the trajectory h5 from S3.
    h5_file_path = download_readdy_hdf5(
        bucket, series_name, series_key, rep_ix, temp_path
    )

    assert isinstance(h5_file_path, str), f"Download failed for {series_key}"

    rep_id = rep_ix + 1
    pickle_key = f"{series_name}/data/{series_key}_{rep_id:06d}.pkl"
    time_inc = total_steps // n_timepoints

    # Load the trajectory, sampling every time_inc steps (the loaded
    # trajectory is pickled to the bucket under pickle_key).
    readdy_loader = ReaddyLoader(
        h5_file_path=h5_file_path,
        time_inc=time_inc,
        timestep=timestep,
        pickle_location=bucket,
        pickle_key=pickle_key,
    )

    post_processor = ReaddyPostProcessor(readdy_loader.trajectory(), box_size=BOX_SIZE)

    # Extract fiber axis positions and resample equally spaced control points.
    times = post_processor.times()
    fiber_chain_ids = post_processor.linear_fiber_chain_ids(polymer_number_range=5)
    axis_positions, _ = post_processor.linear_fiber_axis_positions(fiber_chain_ids)

    fiber_points = post_processor.linear_fiber_control_points(
        axis_positions=axis_positions,
        n_points=n_monomer_points,
    )

    # Build one row per timepoint and fiber point (index 0 selects the single
    # fiber in the simulation).
    point_data: list[list[Union[str, int, float]]] = []
    for time_ix in range(len(fiber_points)):
        for pos_ix in range(fiber_points[0][0].shape[0]):
            point_data.append(
                [
                    1,  # fiber_id
                    fiber_points[time_ix][0][pos_ix][0],  # xpos
                    fiber_points[time_ix][0][pos_ix][1],  # ypos
                    fiber_points[time_ix][0][pos_ix][2],  # zpos
                    0.0,  # xforce
                    0.0,  # yforce
                    0.0,  # zforce
                    0.0,  # segment_curvature
                    times[time_ix],  # time
                    pos_ix,  # fiber_point
                ]
            )

    # Combine all data into dataframe and update data types.
    dataframe = pd.DataFrame(point_data, columns=COLUMN_NAMES)
    dataframe = dataframe.astype(dtype=COLUMN_DTYPES)

    # Add placeholders for features not calculated in ReaDDy.
    dataframe["force_magnitude"] = np.array(len(point_data) * [0.0])
    dataframe["segment_energy"] = np.array(len(point_data) * [0.0])

    return dataframe
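

# A minimal usage sketch for the single-fiber parser (argument values are
# hypothetical placeholders):
#
#     data = parse_readdy_simulation_single_fiber_trajectory(
#         bucket="s3://my-simulation-bucket",
#         series_name="ACTIN_COMPRESSION",
#         series_key="ACTIN_COMPRESSION_0047",
#         rep_ix=0,
#         n_timepoints=200,
#         n_monomer_points=200,
#         total_steps=320000000,
#         temp_path="/tmp",
#     )
#     # `data` has one row per (timepoint, fiber point) pair, with the
#     # columns in COLUMN_NAMES plus force_magnitude and segment_energy.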


def _round_2_sig_figs(x: float) -> int:
    """Round value to two significant figures."""
    return int(round(x, -int(floor(log10(abs(0.1 * x))))))
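

# For example (illustrative values):
#
#     _round_2_sig_figs(1234.5)  # returns 1200
#     _round_2_sig_figs(3.19e8)  # returns 320000000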


def _velocity_for_cond(condition_key: str) -> float:
    """Convert `NNNN` condition key to `NNN.N` velocity."""
    return float(condition_key[:3] + "." + condition_key[-1])
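

# For example (illustrative keys):
#
#     _velocity_for_cond("0047")  # returns 4.7
#     _velocity_for_cond("1500")  # returns 150.0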


def parse_readdy_simulation_data(
    bucket: str,
    series_name: str,
    condition_keys: list[str],
    n_replicates: int,
    n_timepoints: int,
    n_monomer_points: int,
    compression: bool,
    temp_path: str,
) -> None:
    """
    Parse ReaDDy simulation data for select conditions and replicates.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    condition_keys
        List of condition keys.
    n_replicates
        Number of simulation replicates.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    compression
        True if simulations are compressed, False otherwise.
    temp_path
        Path for saving temporary h5 files.
    """
    # For compression simulations, the step count per condition is the
    # compression time (COMPRESSION_DISTANCE in nm, converted to um, divided
    # by the condition velocity, assumed to be in um/s) divided by the
    # 1e-10 s (0.1 ns) timestep, rounded to two significant figures.
    # Uncompressed simulations use a fixed step count keyed by the empty
    # condition key.
    total_steps: dict[str, int] = {}
    if compression:
        total_steps = {
            cond: _round_2_sig_figs(
                (COMPRESSION_DISTANCE * 1e-3 / _velocity_for_cond(cond)) * 1e10
            )
            for cond in condition_keys
        }
    else:
        total_steps = {"": int(1e7)}

    for condition_key in condition_keys:
        series_key = f"{series_name}_{condition_key}" if condition_key else series_name

        for rep_ix in range(n_replicates):
            rep_id = rep_ix + 1
            dataframe_key = f"{series_name}/samples/{series_key}_{rep_id:06d}.csv"

            # Skip if dataframe file already exists.
            if check_key(bucket, dataframe_key):
                print(f"Dataframe [ {dataframe_key} ] already exists. Skipping.")
                continue

            print(f"Parsing data for [ {condition_key} ] replicate [ {rep_ix} ]")

            data = parse_readdy_simulation_single_fiber_trajectory(
                bucket,
                series_name,
                series_key,
                rep_ix,
                n_timepoints,
                n_monomer_points,
                total_steps[condition_key],
                temp_path,
            )

            save_dataframe(bucket, dataframe_key, data, index=False)
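

# A minimal sketch of the full parsing flow (the bucket, series, and
# condition values below are hypothetical placeholders):
#
#     parse_readdy_simulation_data(
#         bucket="s3://my-simulation-bucket",
#         series_name="ACTIN_COMPRESSION",
#         condition_keys=["0047", "0150", "1500"],
#         n_replicates=5,
#         n_timepoints=200,
#         n_monomer_points=200,
#         compression=True,
#         temp_path="/tmp",
#     )
#     # Each (condition, replicate) pair is parsed and saved to
#     # {series_name}/samples/{series_key}_{rep_id:06d}.csv in the bucket.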