# subcell_pipeline/simulation/readdy/parser.py

1"""Methods for parsing ReaDDy simulations.""" 

2 

3import os 

4from math import floor, log10 

5from typing import Optional, Union 

6 

7import boto3 

8import numpy as np 

9import pandas as pd 

10from botocore.exceptions import ClientError 

11from io_collection.keys.check_key import check_key 

12from io_collection.save.save_dataframe import save_dataframe 

13 

14from subcell_pipeline.simulation.readdy.loader import ReaddyLoader 

15from subcell_pipeline.simulation.readdy.post_processor import ReaddyPostProcessor 

16 

COLUMN_NAMES: list[str] = [
    "fiber_id",
    "xpos",
    "ypos",
    "zpos",
    "xforce",
    "yforce",
    "zforce",
    "segment_curvature",
    "time",
    "fiber_point",
]
"""Parsed tidy data column names."""

COLUMN_DTYPES: dict[str, Union[type[float], type[int]]] = {
    "fiber_id": int,
    "xpos": float,
    "ypos": float,
    "zpos": float,
    "xforce": float,
    "yforce": float,
    "zforce": float,
    "segment_curvature": float,
    "time": float,
    "fiber_point": int,
}
"""Parsed tidy data column data types."""

READDY_TIMESTEP: float = 0.1
"""Simulation timestep (in ns)."""

BOX_SIZE: np.ndarray = np.array(3 * [600.0])
"""Default simulation volume dimensions (x, y, z)."""

COMPRESSION_DISTANCE: float = 150.0
"""Total distance the fiber end was displaced (in nm)."""


def _download_s3_file(bucket: str, key: str, dest_path: str) -> Optional[str]:
    """
    Download file from S3 to local path.

    Parameters
    ----------
    bucket
        Name of S3 bucket.
    key
        Source key.
    dest_path
        Target local path.
    """

    # Skip the download if the file already exists at the local path.
    if os.path.isfile(dest_path):
        return dest_path

    s3_client = boto3.client("s3")

    try:
        s3_client.download_file(bucket, key, dest_path)
        print(f"Downloaded [ {key} ] to [ {dest_path} ].")
        return dest_path
    except ClientError:
        print(f"!!! Failed to download {key}")
        return None


def download_readdy_hdf5(
    bucket: str,
    series_name: str,
    series_key: str,
    rep_ix: int,
    download_path: str,
) -> Optional[str]:
    """
    Download ReaDDy h5 files from S3 to local path.

    The ReaDDy Python package currently requires a local file path.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    series_key
        Combination of series and condition names.
    rep_ix
        Replicate index.
    download_path
        Path for downloading temporary h5 files.
    """

    if bucket.startswith("s3://"):
        bucket = bucket.replace("s3://", "")

    aws_h5_key = f"{series_name}/outputs/{series_key}_{rep_ix}.h5"
    local_h5_path = os.path.join(download_path, f"{series_key}_{rep_ix}.h5")
    return _download_s3_file(bucket, aws_h5_key, local_h5_path)
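
# For orientation, a hypothetical call (bucket and series names here are made
# up, not values shipped with this module):
#
#     download_readdy_hdf5("my-bucket", "ACTIN_SERIES", "ACTIN_SERIES_0047", 0, "/tmp")
#
# would fetch s3://my-bucket/ACTIN_SERIES/outputs/ACTIN_SERIES_0047_0.h5 to
# /tmp/ACTIN_SERIES_0047_0.h5, reusing the local copy if it already exists.
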

def parse_readdy_simulation_single_fiber_trajectory(
    bucket: str,
    series_name: str,
    series_key: str,
    rep_ix: int,
    n_timepoints: int,
    n_monomer_points: int,
    total_steps: int,
    temp_path: str,
    timestep: float = READDY_TIMESTEP,
) -> pd.DataFrame:
    """
    Parse ReaDDy trajectory data into tidy data format.

    Note that this method assumes there is only one fiber in the simulation.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    series_key
        Combination of series and condition names.
    rep_ix
        Replicate index.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    total_steps
        Total number of steps for the given simulation.
    temp_path
        Path for saving temporary h5 files.
    timestep
        Simulation timestep (in ns).
    """

    h5_file_path = download_readdy_hdf5(
        bucket, series_name, series_key, rep_ix, temp_path
    )

    assert isinstance(h5_file_path, str)

    rep_id = rep_ix + 1
    pickle_key = f"{series_name}/data/{series_key}_{rep_id:06d}.pkl"
    time_inc = total_steps // n_timepoints

    readdy_loader = ReaddyLoader(
        h5_file_path=h5_file_path,
        time_inc=time_inc,
        timestep=timestep,
        pickle_location=bucket,
        pickle_key=pickle_key,
    )

    post_processor = ReaddyPostProcessor(readdy_loader.trajectory(), box_size=BOX_SIZE)

    # Resample each sampled timepoint to a fixed number of control points
    # along the fiber axis.
    times = post_processor.times()
    fiber_chain_ids = post_processor.linear_fiber_chain_ids(polymer_number_range=5)
    axis_positions, _ = post_processor.linear_fiber_axis_positions(fiber_chain_ids)

    fiber_points = post_processor.linear_fiber_control_points(
        axis_positions=axis_positions,
        n_points=n_monomer_points,
    )

    point_data: list[list[Union[str, int, float]]] = []
    for time_ix in range(len(fiber_points)):
        for pos_ix in range(fiber_points[0][0].shape[0]):
            point_data.append(
                [
                    1,  # fiber_id
                    fiber_points[time_ix][0][pos_ix][0],  # xpos
                    fiber_points[time_ix][0][pos_ix][1],  # ypos
                    fiber_points[time_ix][0][pos_ix][2],  # zpos
                    0.0,  # xforce
                    0.0,  # yforce
                    0.0,  # zforce
                    0.0,  # segment_curvature
                    times[time_ix],  # time
                    pos_ix,  # fiber_point
                ]
            )

    # Combine all data into dataframe and update data types.
    dataframe = pd.DataFrame(point_data, columns=COLUMN_NAMES)
    dataframe = dataframe.astype(dtype=COLUMN_DTYPES)

    # Add placeholders for features not calculated in ReaDDy.
    dataframe["force_magnitude"] = np.array(len(point_data) * [0.0])
    dataframe["segment_energy"] = np.array(len(point_data) * [0.0])

    return dataframe
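
# A minimal usage sketch (all argument values below are hypothetical). The
# result is a tidy dataframe with roughly n_timepoints * n_monomer_points rows
# and 12 columns: the ten in COLUMN_NAMES plus the force_magnitude and
# segment_energy placeholders.
#
#     data = parse_readdy_simulation_single_fiber_trajectory(
#         bucket="my-bucket",
#         series_name="ACTIN_SERIES",
#         series_key="ACTIN_SERIES_0047",
#         rep_ix=0,
#         n_timepoints=200,
#         n_monomer_points=200,
#         total_steps=int(1e7),
#         temp_path="/tmp",
#     )
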

def _round_2_sig_figs(x: float) -> int:
    """Round value to two significant figures."""
    return int(round(x, -int(floor(log10(abs(0.1 * x))))))
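
# Worked example for the helper above (illustrative value): for x = 123456,
# 0.1 * x = 12345.6, floor(log10(12345.6)) = 4, and round(123456, -4) = 120000,
# which keeps the two leading significant figures.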


def _velocity_for_cond(condition_key: str) -> float:
    """Convert `NNNN` condition key to `NNN.N` velocity."""
    return float(condition_key[:3] + "." + condition_key[-1])
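
# For example, a hypothetical condition key "0047" maps to "004.7", i.e. the
# velocity 4.7. Given the unit conversions in parse_readdy_simulation_data
# below, this is consistent with a velocity expressed in um/s.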


def parse_readdy_simulation_data(
    bucket: str,
    series_name: str,
    condition_keys: list[str],
    n_replicates: int,
    n_timepoints: int,
    n_monomer_points: int,
    compression: bool,
    temp_path: str,
) -> None:
    """
    Parse ReaDDy simulation data for select conditions and replicates.

    Parameters
    ----------
    bucket
        Name of S3 bucket for input and output files.
    series_name
        Name of simulation series.
    condition_keys
        List of condition keys.
    n_replicates
        Number of simulation replicates.
    n_timepoints
        Number of equally spaced timepoints to sample.
    n_monomer_points
        Number of equally spaced monomer points to sample.
    compression
        True if simulations are compressed, False otherwise.
    temp_path
        Path for saving temporary h5 files.
    """

    # For compression runs, the number of steps is set by how long it takes to
    # displace the fiber end by COMPRESSION_DISTANCE at the condition velocity.
    total_steps: dict[str, int]
    if compression:
        total_steps = {
            cond: _round_2_sig_figs(
                (COMPRESSION_DISTANCE * 1e-3 / _velocity_for_cond(cond)) * 1e10
            )
            for cond in condition_keys
        }
    else:
        total_steps = {"": int(1e7)}
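
    # Worked example (hypothetical condition key "0047", i.e. velocity 4.7):
    # displacing the fiber end by 150 nm = 0.15 um takes 0.15 / 4.7 ~= 0.0319 s;
    # at the 0.1 ns (1e-10 s) timestep, that is ~= 3.19e8 steps, which
    # _round_2_sig_figs rounds to 3.2e8.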

    for condition_key in condition_keys:
        series_key = f"{series_name}_{condition_key}" if condition_key else series_name

        for rep_ix in range(n_replicates):
            rep_id = rep_ix + 1
            dataframe_key = f"{series_name}/samples/{series_key}_{rep_id:06d}.csv"

            # Skip if dataframe file already exists.
            if check_key(bucket, dataframe_key):
                print(f"Dataframe [ {dataframe_key} ] already exists. Skipping.")
                continue

            print(f"Parsing data for [ {condition_key} ] replicate [ {rep_ix} ]")

            data = parse_readdy_simulation_single_fiber_trajectory(
                bucket,
                series_name,
                series_key,
                rep_ix,
                n_timepoints,
                n_monomer_points,
                total_steps[condition_key],
                temp_path,
            )

            save_dataframe(bucket, dataframe_key, data, index=False)
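

# Example invocation (a sketch; the bucket, series, and condition values are
# hypothetical, not defaults defined by this module):
#
#     parse_readdy_simulation_data(
#         bucket="s3://my-bucket",
#         series_name="ACTIN_COMPRESSION",
#         condition_keys=["0047", "0150"],
#         n_replicates=5,
#         n_timepoints=200,
#         n_monomer_points=200,
#         compression=True,
#         temp_path="/tmp",
#     )
#
# This parses conditions ACTIN_COMPRESSION_0047 and ACTIN_COMPRESSION_0150 for
# replicates 0 through 4, skipping any replicate whose sampled dataframe
# already exists in the bucket.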