# subcell_pipeline/simulation/readdy/parser.py

1"""Methods for parsing ReaDDy simulations.""" 

2 

3import os 

4from math import floor, log10 

5from typing import Optional, Union 

6 

7import boto3 

8import numpy as np 

9import pandas as pd 

10from botocore.exceptions import ClientError 

11from io_collection.keys.check_key import check_key 

12from io_collection.save.save_dataframe import save_dataframe 

13 

14from subcell_pipeline.simulation.readdy.loader import ReaddyLoader 

15from subcell_pipeline.simulation.readdy.post_processor import ReaddyPostProcessor 

16 

COLUMN_NAMES: list[str] = [
    "fiber_id",
    "xpos",
    "ypos",
    "zpos",
    "xforce",
    "yforce",
    "zforce",
    "segment_curvature",
    "time",
    "fiber_point",
]
"""Parsed tidy data column names."""

COLUMN_DTYPES: dict[str, Union[type[float], type[int]]] = {
    "fiber_id": int,
    "xpos": float,
    "ypos": float,
    "zpos": float,
    "xforce": float,
    "yforce": float,
    "zforce": float,
    "segment_curvature": float,
    "time": float,
    "fiber_point": int,
}
"""Parsed tidy data column data types."""

READDY_TIMESTEP: float = 0.1
"""Simulation timestep (in ns)."""

BOX_SIZE: np.ndarray = np.array(3 * [600.0])
"""Default simulation volume dimensions (x, y, z)."""

COMPRESSION_DISTANCE: float = 150.0
"""Total distance the fiber end was displaced (in nm)."""


def _download_s3_file(bucket: str, key: str, dest_path: str) -> Optional[str]:
    """
    Download file from S3 to local path.

    Parameters
    ----------
    bucket
        Name of S3 bucket.
    key
        Source key.
    dest_path
        Target local path.
    """

    # Skip the download if the file already exists at the local path.
    if os.path.isfile(dest_path):
        return dest_path

    s3_client = boto3.client("s3")

    try:
        s3_client.download_file(bucket, key, dest_path)
        print(f"Downloaded [ {key} ] to [ {dest_path} ].")
        return dest_path
    except ClientError:
        print(f"!!! Failed to download {key}")
        return None


def download_readdy_hdf5(
    bucket: str,
    series_name: str,
    series_key: str,
    rep_ix: int,
    download_path: str,
) -> Optional[str]:
    """
    Download ReaDDy h5 files from S3 to local path.

    The ReaDDy Python package currently requires a local file path.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    series_key
        Combination of series and condition names.
    rep_ix
        Replicate index.
    download_path
        Path for downloading temporary h5 files.
    """

    if bucket.startswith("s3://"):
        bucket = bucket.replace("s3://", "")

    aws_h5_key = f"{series_name}/outputs/{series_key}_{rep_ix}.h5"
    local_h5_path = os.path.join(download_path, f"{series_key}_{rep_ix}.h5")
    return _download_s3_file(bucket, aws_h5_key, local_h5_path)
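
# For orientation, a hypothetical call (bucket and series names here are made
# up, not values shipped with this module):
#
#     download_readdy_hdf5("my-bucket", "ACTIN_SERIES", "ACTIN_SERIES_0047", 0, "/tmp")
#
# would fetch s3://my-bucket/ACTIN_SERIES/outputs/ACTIN_SERIES_0047_0.h5 to
# /tmp/ACTIN_SERIES_0047_0.h5, reusing the local copy if it already exists.
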

def parse_readdy_simulation_single_fiber_trajectory(
    bucket: str,
    series_name: str,
    series_key: str,
    rep_ix: int,
    n_timepoints: int,
    n_monomer_points: int,
    total_steps: int,
    temp_path: str,
    timestep: float = READDY_TIMESTEP,
) -> pd.DataFrame:
    """
    Parse ReaDDy trajectory data into tidy data format.

    Note that this method assumes there is only one fiber in the simulation.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    series_key
        Combination of series and condition names.
    rep_ix
        Replicate index.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    total_steps
        Total number of steps for the given simulation.
    temp_path
        Path for saving temporary h5 files.
    timestep
        Simulation timestep (in ns).
    """

    h5_file_path = download_readdy_hdf5(
        bucket, series_name, series_key, rep_ix, temp_path
    )

    assert isinstance(h5_file_path, str)

    rep_id = rep_ix + 1
    pickle_key = f"{series_name}/data/{series_key}_{rep_id:06d}.pkl"
    time_inc = total_steps // n_timepoints

    readdy_loader = ReaddyLoader(
        h5_file_path=h5_file_path,
        time_inc=time_inc,
        timestep=timestep,
        pickle_location=bucket,
        pickle_key=pickle_key,
    )

    post_processor = ReaddyPostProcessor(readdy_loader.trajectory(), box_size=BOX_SIZE)

    # Resample each sampled timepoint to a fixed number of control points
    # along the fiber axis.
    times = post_processor.times()
    fiber_chain_ids = post_processor.linear_fiber_chain_ids(polymer_number_range=5)
    axis_positions, _ = post_processor.linear_fiber_axis_positions(fiber_chain_ids)

    fiber_points = post_processor.linear_fiber_control_points(
        axis_positions=axis_positions,
        n_points=n_monomer_points,
    )

    point_data: list[list[Union[str, int, float]]] = []
    for time_ix in range(len(fiber_points)):
        for pos_ix in range(fiber_points[0][0].shape[0]):
            point_data.append(
                [
                    1,  # fiber_id
                    fiber_points[time_ix][0][pos_ix][0],  # xpos
                    fiber_points[time_ix][0][pos_ix][1],  # ypos
                    fiber_points[time_ix][0][pos_ix][2],  # zpos
                    0.0,  # xforce
                    0.0,  # yforce
                    0.0,  # zforce
                    0.0,  # segment_curvature
                    times[time_ix],  # time
                    pos_ix,  # fiber_point
                ]
            )

    # Combine all data into dataframe and update data types.
    dataframe = pd.DataFrame(point_data, columns=COLUMN_NAMES)
    dataframe = dataframe.astype(dtype=COLUMN_DTYPES)

    # Add placeholders for features not calculated in ReaDDy.
    dataframe["force_magnitude"] = np.array(len(point_data) * [0.0])
    dataframe["segment_energy"] = np.array(len(point_data) * [0.0])

    return dataframe
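
# A minimal usage sketch (all argument values below are hypothetical). The
# result is a tidy dataframe with roughly n_timepoints * n_monomer_points rows
# and 12 columns: the ten in COLUMN_NAMES plus the force_magnitude and
# segment_energy placeholders.
#
#     data = parse_readdy_simulation_single_fiber_trajectory(
#         bucket="my-bucket",
#         series_name="ACTIN_SERIES",
#         series_key="ACTIN_SERIES_0047",
#         rep_ix=0,
#         n_timepoints=200,
#         n_monomer_points=200,
#         total_steps=int(1e7),
#         temp_path="/tmp",
#     )
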

def _round_2_sig_figs(x: float) -> int:
    """Round value to two significant figures."""
    return int(round(x, -int(floor(log10(abs(0.1 * x))))))
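
# Worked example for the helper above (illustrative value): for x = 123456,
# 0.1 * x = 12345.6, floor(log10(12345.6)) = 4, and round(123456, -4) = 120000,
# which keeps the two leading significant figures.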


def _velocity_for_cond(condition_key: str) -> float:
    """Convert `NNNN` condition key to `NNN.N` velocity."""
    return float(condition_key[:3] + "." + condition_key[-1])
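
# For example, a hypothetical condition key "0047" maps to "004.7", i.e. the
# velocity 4.7. Given the unit conversions in parse_readdy_simulation_data
# below, this is consistent with a velocity expressed in um/s.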


def parse_readdy_simulation_data(
    bucket: str,
    series_name: str,
    condition_keys: list[str],
    n_replicates: int,
    n_timepoints: int,
    n_monomer_points: int,
    compression: bool,
    temp_path: str,
) -> None:
    """
    Parse ReaDDy simulation data for select conditions and replicates.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    condition_keys
        List of condition keys.
    n_replicates
        Number of simulation replicates.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    compression
        True if simulations are compressed, False otherwise.
    temp_path
        Path for saving temporary h5 files.
    """

    # For compression runs, the number of steps is set by how long it takes to
    # displace the fiber end by COMPRESSION_DISTANCE at the condition velocity.
    total_steps: dict[str, int]
    if compression:
        total_steps = {
            cond: _round_2_sig_figs(
                (COMPRESSION_DISTANCE * 1e-3 / _velocity_for_cond(cond)) * 1e10
            )
            for cond in condition_keys
        }
    else:
        total_steps = {"": int(1e7)}
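
    # Worked example (hypothetical condition key "0047", i.e. velocity 4.7):
    # displacing the fiber end by 150 nm = 0.15 um takes 0.15 / 4.7 ~= 0.0319 s;
    # at the 0.1 ns (1e-10 s) timestep, that is ~= 3.19e8 steps, which
    # _round_2_sig_figs rounds to 3.2e8.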

    for condition_key in condition_keys:
        series_key = f"{series_name}_{condition_key}" if condition_key else series_name

        for rep_ix in range(n_replicates):
            rep_id = rep_ix + 1
            dataframe_key = f"{series_name}/samples/{series_key}_{rep_id:06d}.csv"

            # Skip if dataframe file already exists.
            if check_key(bucket, dataframe_key):
                print(f"Dataframe [ {dataframe_key} ] already exists. Skipping.")
                continue

            print(f"Parsing data for [ {condition_key} ] replicate [ {rep_ix} ]")

            data = parse_readdy_simulation_single_fiber_trajectory(
                bucket,
                series_name,
                series_key,
                rep_ix,
                n_timepoints,
                n_monomer_points,
                total_steps[condition_key],
                temp_path,
            )

            save_dataframe(bucket, dataframe_key, data, index=False)
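

# Example invocation (a sketch; the bucket, series, and condition values are
# hypothetical, not defaults defined by this module):
#
#     parse_readdy_simulation_data(
#         bucket="s3://my-bucket",
#         series_name="ACTIN_COMPRESSION",
#         condition_keys=["0047", "0150"],
#         n_replicates=5,
#         n_timepoints=200,
#         n_monomer_points=200,
#         compression=True,
#         temp_path="/tmp",
#     )
#
# This parses conditions ACTIN_COMPRESSION_0047 and ACTIN_COMPRESSION_0150 for
# replicates 0 through 4, skipping any replicate whose sampled dataframe
# already exists in the bucket.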