1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
(* SPDX-License-Identifier: MIT *)
(* Copyright (C) 2023-2026 formalsec *)
(* Written by the Smtml programmers *)
open Cmdliner
open Term.Syntax
open Rresult
open Smtml
module IntSet = Set.Make (Int)
let script_name = R.failwith_error_msg (Fpath.of_string "smtzilla.py")
let smtzilla_data_dirpath () =
match Smtml_sites.Sites.data with
| [ dirpath ] -> Fpath.of_string dirpath
| _ ->
Fmt.error_msg
"Expected one directory path in Smtzilla_utils.Sites.data, instead got: \
%d values"
(List.length Smtml_sites.Sites.data)
let python_script_path () =
let python_script_path =
smtzilla_data_dirpath () >>| fun dirpath -> Fpath.(dirpath // script_name)
in
let res =
python_script_path >>= fun script_path ->
Bos.OS.File.exists script_path >>= fun exists ->
if exists then Ok script_path
else
Fmt.error_msg "The python script file does not exist in: %a" Fpath.pp
script_path
in
R.failwith_error_msg res
let parse_file s =
Fpath.of_string s >>= fun path ->
Bos.OS.File.exists path >>= fun b ->
if b then Ok path
else Fmt.error_msg "The file '%a' does not exist" Fpath.pp path
let file_exists_conv = Arg.conv (parse_file, Fpath.pp)
let csv_file_exists_conv =
let parse_csv_file s =
Fpath.of_string s >>= fun path ->
if Fpath.has_ext "csv" path then Ok path
else Fmt.error_msg "File '%a' is not a CSV file" Fpath.pp path
in
Arg.conv (parse_csv_file, Fpath.pp)
let dir_exists_conv =
let parse s =
Fpath.of_string s >>= fun path ->
Bos.OS.Dir.exists path >>= fun b ->
if b then Ok path
else Fmt.error_msg "The directory '%a' does not exist" Fpath.pp path
in
Arg.conv (parse, Fpath.pp)
let existing_parent_dir_conv =
let parse s =
Fpath.of_string s >>= fun path ->
let dir, _ = Fpath.split_base path in
Bos.OS.Dir.exists dir >>= fun b ->
if b then Ok path
else Fmt.error_msg "No parent directory for '%a'" Fpath.pp path
in
Arg.conv (parse, Fpath.pp)
let debug =
let doc = "Print debugging messages." in
Arg.(value & flag & info [ "debug" ] ~doc)
type predictor =
| GradientBoost
| DecisionTree
let predictor_conv =
let parse s =
match String.lowercase_ascii s with
| "gradient-boost" | "gb" -> Ok GradientBoost
| "decision-tree" | "dt" -> Ok DecisionTree
| _ -> Error (`Msg (Printf.sprintf "Unknown model type: %s" s))
in
let print ppf = function
| GradientBoost -> Format.fprintf ppf "gradient-boost"
| DecisionTree -> Format.fprintf ppf "decision-tree"
in
Arg.conv (parse, print)
let logic_conv = Arg.conv (Logic.of_string, Logic.pp)
let pos_int =
let parser x =
match int_of_string_opt x with
| Some i when i > 0 -> Ok i
| None | Some _ -> Fmt.error_msg "Expected a positive integer"
in
Arg.conv (parser, Format.pp_print_int)
let n_estimators =
let doc =
"Number of estimators (must be > 0, used only for gradient-boost)."
in
Arg.(value & opt pos_int 5 & info [ "n-estimators" ] ~doc)
let max_depth =
let doc = "Maximum depth of trees (must be > 0)." in
Arg.(value & opt pos_int 5 & info [ "max-depth" ] ~doc)
let gradient_boost =
let doc =
"Predictor kind, either a gradient boosting regressor (gradient-boost or \
gb), which is the default, or a decision tree regressor (decision-tree or \
dt)."
in
Arg.(value & opt predictor_conv GradientBoost & info [ "predictor" ] ~doc)
let pp_stats =
let doc = "Print statistics on queries." in
Arg.(value & flag & info [ "pp-stats" ] ~doc)
let run_simulation =
let doc = "Run simulation on trained model." in
Arg.(value & flag & info [ "simulation" ] ~doc)
let marshalled_file =
let doc = "Path to the file containing marshalled queries." in
Arg.(
required & pos 0 (some file_exists_conv) None (info [] ~doc ~docv:"INPUT") )
let dest_dir =
let doc = "Path to a directory in which to store the exported queries." in
Arg.(required & pos 1 (some dir_exists_conv) None (info [] ~doc ~docv:"DIR"))
let output_csv =
let doc =
"Path to the output csv file in which to store the query features."
in
Arg.(
required
& pos 1 (some existing_parent_dir_conv) None (info [] ~doc ~docv:"CSV") )
let input_csv =
let doc =
"Path to the csv file that holds the query features (produced with the \
`extract-features` command)."
in
Arg.(
required & pos 0 (some csv_file_exists_conv) None (info [] ~doc ~docv:"CSV") )
let output_json p =
let doc = "Path to the JSON file to which the model will be exported." in
Arg.(
required
& pos p (some existing_parent_dir_conv) None (info [] ~doc ~docv:"JSON") )
let logic =
let doc = "Logic to use in the SMT files of the extracted queries." in
Arg.(value & opt (some logic_conv) None & info [ "logic" ] ~doc)
let set_debug debug =
if debug then Logs.Src.set_level Smtml.Log.src (Some Logs.Debug);
Logs.set_reporter @@ Logs.format_reporter ()
let rec is_trivial_assert e =
match Expr.view e with
| Val _ | Symbol _ -> true
| Relop (_, _, { node = Symbol _ | Val _; _ }, { node = Symbol _ | Val _; _ })
| Binop (_, _, { node = Symbol _ | Val _; _ }, { node = Symbol _ | Val _; _ })
->
true
| Binop (_, _, e1, e2) | Relop (_, _, e1, e2) ->
is_trivial_assert e1 || is_trivial_assert e2
| Cvtop (_, _, e) | Unop (_, _, e) -> is_trivial_assert e
| _ -> false
(* TODO: Ideally, trivial queries should not arrive to the solver at all, but
doing so properly, while taking into account assertions added with `add` would
require more effort. *)
let are_trivial = function
| [] -> true
| [ a ] when is_trivial_assert a ->
(* When there is only one assert, which is a relop between a symbol and a
const, or a unop on a symbol or const, the query is considered as trivial.
*)
true
| l when List.length l < 10 && List.for_all is_trivial_assert l ->
(* When there are less than 10 asserts (totally arbitrary) and all asserts
are trivial *)
true
| _l -> false
(* type annotation because otherwise the typechecker thinks ?status is ~status *)
let rec extract_queries
(smt2pp :
?name:string -> ?status:[ `Sat | `Unknown | `Unsat ] -> Expr.t list Fmt.t )
destdir seen cnt l =
match l with
| [] -> Ok (seen, cnt)
| (_, assertions, _, time, _) :: t
when time <= 10_000_000L || are_trivial assertions ->
(* Time limit to filter out simple tests that are solver in less than
0.01 seconds *)
extract_queries smt2pp destdir seen cnt t
| (_, assertions, _, _, status) :: t ->
(* TODO: do better than Hashtbl.hash *)
let hash = Hashtbl.hash assertions in
if IntSet.mem hash seen then extract_queries smt2pp destdir seen cnt t
else
let file_path = Fpath.(destdir / Fmt.str "query.%d.smt2" cnt) in
Bos.OS.File.writef file_path "%a" (smt2pp ?name:None ~status) assertions
>>= fun _ ->
extract_queries smt2pp destdir (IntSet.add hash seen) (cnt + 1) t
let rec queries_from_ic smt2pp destdir seen cnt ic =
let queries :
(string * Expr.t list * bool * int64 * [ `Sat | `Unsat | `Unknown ]) list =
Marshal.from_channel ic
in
extract_queries smt2pp destdir seen cnt queries >>= fun (seen, cnt) ->
queries_from_ic smt2pp destdir seen cnt ic
let extract_queries ?logic (path : Fpath.t) (destdir : Fpath.t) =
let (module M) = Solver_type.to_mappings Solver_type.Z3_solver in
if not M.is_available then
Fmt.failwith "Query extraction to smt file depends on Z3";
let smt2pp = M.Smtlib.pp ?logic in
let res =
Bos.OS.File.with_ic path
(fun ic seen ->
try queries_from_ic smt2pp destdir seen 1 ic >>| fun _ -> ()
with End_of_file ->
Log.debug (fun k -> k "Finished extracting queries@.");
Ok () )
IntSet.empty
in
match Rresult.R.join res with
| Ok () -> ()
| Error (`Msg msg) ->
Fmt.failwith "Failed to extract queries from %a\nBecause of: %s" Fpath.pp
path msg
let extract_queries_cmd =
let extract_info =
let doc =
"Given a file containing marshalled smtml queries, extracts the (unique) \
queries into files in a given folder. The files are named \
`query.n.smt2` where `n` is the query's number."
in
Cmd.info "extract-queries" ~doc
in
let extract =
let+ marshalled_file
and+ dest_dir
and+ logic in
extract_queries ?logic marshalled_file dest_dir
in
Cmd.v extract_info extract
let run_regression ~debug ~gradient_boost ~n_estimators ~max_depth ~pp_stats
~run_simulation ~input_csv ~output_json =
let debug = Bos.Cmd.(if debug then v "--debug" else empty) in
let gradient_boost =
Bos.Cmd.(
match gradient_boost with
| GradientBoost -> v "--gradient-boost"
| DecisionTree -> v "--no-gradient-boost" )
in
let n_estimators =
Bos.Cmd.(v "--n-estimators" % string_of_int n_estimators)
in
let max_depth = Bos.Cmd.(v "--max-depth" % string_of_int max_depth) in
let pp_stats = Bos.Cmd.(if pp_stats then v "--pp-stats" else empty) in
let run_simulation =
Bos.Cmd.(if run_simulation then v "--simulation" else empty)
in
let py_script_path = python_script_path () in
let cmd =
Bos.Cmd.(
v "python3" % p py_script_path %% debug %% gradient_boost %% n_estimators
%% max_depth %% pp_stats %% run_simulation % p input_csv % p output_json )
in
Smtml.Log.debug (fun k -> k "Running: %a@." Bos.Cmd.pp cmd);
match Bos.OS.Cmd.run cmd with
| Ok () -> ()
| Error (`Msg msg) ->
Smtml.Log.err (fun k ->
k
"SMTZilla: Running the python script failed: \n\
%s\n\
If the error is a python error, make sure that you have installed the \
necessary python packages to run the script located at: %a.@."
msg Fpath.pp py_script_path );
Fmt.failwith "Python script failure"
let extract_features_cmd =
let extract_info =
let doc =
"Given a file containing marshalled smtml queries, extracts features \
from those queries which are then used to create the decision tree."
in
Cmd.info "extract-features" ~doc
in
let extract =
let+ debug
and+ marshalled_file
and+ output_csv in
set_debug debug;
Smtml.Feature_extraction.cmd marshalled_file output_csv
in
Cmd.v extract_info extract
let regression_cmd =
let regression_info =
let doc =
"Given a CSV file with query features train the regression model"
in
Cmd.info "regression" ~doc
in
let regression =
let+ debug
and+ gradient_boost
and+ n_estimators
and+ max_depth
and+ pp_stats
and+ run_simulation
and+ input_csv
and+ output_json = output_json 1 in
set_debug debug;
run_regression ~debug ~gradient_boost ~n_estimators ~max_depth ~pp_stats
~run_simulation ~output_json ~input_csv
in
Cmd.v regression_info regression
let train_cmd =
let train_info =
let doc =
"Given a marshalled file of queries, generate a CSV file with query \
features, and train the regression model on it"
in
Cmd.info "train" ~doc
in
let train =
let+ debug
and+ gradient_boost
and+ n_estimators
and+ max_depth
and+ pp_stats
and+ run_simulation
and+ marshalled_file
and+ output_csv
and+ output_json = output_json 2 in
set_debug debug;
Smtml.Feature_extraction.cmd marshalled_file output_csv;
run_regression ~debug ~gradient_boost ~n_estimators ~max_depth ~pp_stats
~run_simulation ~output_json ~input_csv:output_csv
in
Cmd.v train_info train