Slicing optimization for line/region indentation feature for emacs

ocaml-ppx · Jan 18, 2022 · cf065a3 · cf065a3
1 parent 0d66622
commit cf065a3
Show file tree

Hide file tree

Showing 13 changed files with 1,209 additions and 8 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -18,6 +18,8 @@
 
   + Print odoc code block delimiters on their own line (#1980, @gpetiot)
 
+  + Optimize line/region re-indentation feature in emacs (#1609, @gpetiot)
+
 #### New features
 
 #### RPC

diff --git a/lib/Cmt_lexer.ml b/lib/Cmt_lexer.ml
@@ -0,0 +1,75 @@
+(**************************************************************************)
+(*                                                                        *)
+(*                              OCamlFormat                               *)
+(*                                                                        *)
+(*            Copyright (c) Facebook, Inc. and its affiliates.            *)
+(*                                                                        *)
+(*      This source code is licensed under the MIT license found in       *)
+(*      the LICENSE file in the root directory of this source tree.       *)
+(*                                                                        *)
+(**************************************************************************)
+
+type token = Cmt of string Location.loc | S of string Location.loc
+
+open Location
+
+let inc_pos (pos : Lexing.position) = {pos with pos_cnum= pos.pos_cnum + 1}
+
+let nl_pos (pos : Lexing.position) =
+  { pos with
+    pos_lnum= pos.pos_lnum + 1
+  ; pos_bol= pos.pos_cnum + 1
+  ; pos_cnum= pos.pos_cnum + 1 }
+
+let update_pos pos input ~start_i ~end_i =
+  let rec aux pos i =
+    if i < end_i then
+      match String.unsafe_get input i with
+      | '\n' -> aux (nl_pos pos) (i + 1)
+      | _ -> aux (inc_pos pos) (i + 1)
+    else pos
+  in
+  (aux pos start_i, String.sub ~pos:start_i ~len:(end_i - start_i) input)
+
+let loc_ghost = false
+
+let lex_comments input =
+  let pos_fname = !Location.input_name in
+  let rec aux acc (pos : Lexing.position) =
+    if pos.pos_cnum >= String.length input then acc
+    else
+      match String.substr_index ~pos:pos.pos_cnum ~pattern:"(*" input with
+      | Some opn -> (
+          let acc, pos =
+            if opn = pos.pos_cnum then (acc, pos)
+            else
+              let pos', s =
+                update_pos pos input ~start_i:pos.pos_cnum ~end_i:opn
+              in
+              let loc = {loc_ghost; loc_start= pos; loc_end= pos'} in
+              (S (Location.mkloc s loc) :: acc, pos')
+          in
+          match String.substr_index ~pos:opn ~pattern:"*)" input with
+          | Some cls ->
+              let pos', cmt =
+                update_pos pos input ~start_i:opn ~end_i:(cls + 2)
+              in
+              let loc = {loc_ghost; loc_start= pos; loc_end= pos'} in
+              let acc = Cmt (Location.mkloc cmt loc) :: acc in
+              aux acc pos'
+          | None ->
+              let pos', cmt =
+                update_pos pos input ~start_i:opn
+                  ~end_i:(String.length input)
+              in
+              let loc = {loc_ghost; loc_start= pos; loc_end= pos'} in
+              Cmt (Location.mkloc cmt loc) :: acc )
+      | None ->
+          let pos', s =
+            update_pos pos input ~start_i:pos.pos_cnum
+              ~end_i:(String.length input)
+          in
+          let loc = {loc_ghost; loc_start= pos; loc_end= pos'} in
+          S (Location.mkloc s loc) :: acc
+  in
+  List.rev (aux [] {pos_fname; pos_lnum= 0; pos_bol= 0; pos_cnum= 0})
diff --git a/lib/Cmt_lexer.mli b/lib/Cmt_lexer.mli
@@ -0,0 +1,16 @@
+(**************************************************************************)
+(*                                                                        *)
+(*                              OCamlFormat                               *)
+(*                                                                        *)
+(*            Copyright (c) Facebook, Inc. and its affiliates.            *)
+(*                                                                        *)
+(*      This source code is licensed under the MIT license found in       *)
+(*      the LICENSE file in the root directory of this source tree.       *)
+(*                                                                        *)
+(**************************************************************************)
+
+type token = Cmt of string Location.loc | S of string Location.loc
+
+val lex_comments : string -> token list
+(** [lex_comments x] splits [x] into a sequence of comments and non-comments
+    strings. *)
diff --git a/lib/Migrate_ast.ml b/lib/Migrate_ast.ml
@@ -32,6 +32,11 @@ module Position = struct
   include (val Comparator.make ~compare ~sexp_of_t)
 
   let distance p1 p2 = p2.pos_cnum - p1.pos_cnum
+
+  let pp fs {pos_fname; pos_lnum; pos_bol; pos_cnum} =
+    Format.fprintf fs
+      "@[<2>{ pos_fname= %S;@;pos_lnum= %i;@;pos_bol= %i;@;pos_cnum= %i }@]"
+      pos_fname pos_lnum pos_bol pos_cnum
 end
 
 module Location = struct
@@ -94,6 +99,14 @@ module Location = struct
     let min a b = if width a < width b then a else b in
     List.reduce_exn (loc :: stack) ~f:min
 
+  let pp fs {loc_ghost; loc_start; loc_end} =
+    Format.fprintf fs
+      "@[<2>{ loc_ghost= %b;@;loc_start= %a;@;loc_end= %a }@]" loc_ghost
+      Position.pp loc_start Position.pp loc_end
+
+  let pp_loc f fs {txt; loc} =
+    Format.fprintf fs "@[<2>{ txt= %a;@;loc= %a }@]" f txt pp loc
+
   let of_lexbuf (lexbuf : Lexing.lexbuf) =
     { loc_start= lexbuf.lex_start_p
     ; loc_end= lexbuf.lex_curr_p

diff --git a/lib/Migrate_ast.mli b/lib/Migrate_ast.mli
@@ -64,6 +64,9 @@ module Location : sig
 
   val is_single_line : t -> int -> bool
 
+  val pp_loc :
+    (Format.formatter -> 'a -> unit) -> Format.formatter -> 'a loc -> unit
+
   val of_lexbuf : Lexing.lexbuf -> t
 
   val print : Format.formatter -> t -> unit

diff --git a/lib/Slicer.ml b/lib/Slicer.ml
@@ -0,0 +1,222 @@
+(**************************************************************************)
+(*                                                                        *)
+(*                              OCamlFormat                               *)
+(*                                                                        *)
+(*            Copyright (c) Facebook, Inc. and its affiliates.            *)
+(*                                                                        *)
+(*      This source code is licensed under the MIT license found in       *)
+(*      the LICENSE file in the root directory of this source tree.       *)
+(*                                                                        *)
+(**************************************************************************)
+
+let find_semisemi_start ?pos txt =
+  String.substr_index_all ~may_overlap:false ~pattern:";;" txt
+  |> fun l ->
+  ( match pos with
+  | Some pos -> List.filter l ~f:(fun x -> x <= pos - 2)
+  | None -> l )
+  |> List.last
+
+let find_semisemi_end ?pos txt =
+  String.substr_index_all ~may_overlap:false ~pattern:";;" txt
+  |> fun l ->
+  ( match pos with
+  | Some pos -> List.filter l ~f:(fun x -> x > pos)
+  | None -> l )
+  |> List.hd
+
+let find_nl_start ?pos txt =
+  let followed_by_space x =
+    match String.get txt (x + 2) with
+    | exception _ -> true
+    | ' ' -> false
+    | _ -> true
+  in
+  String.substr_index_all ~may_overlap:false ~pattern:"\n\n" txt
+  |> fun l ->
+  ( match pos with
+  | Some pos ->
+      List.filter l ~f:(fun x -> x <= pos - 2 && followed_by_space x)
+  | None -> List.filter l ~f:followed_by_space )
+  |> List.last
+
+let find_nl_end ?pos txt =
+  let followed_by_space x =
+    match String.get txt (x + 2) with
+    | exception _ -> true
+    | ' ' -> false
+    | _ -> true
+  in
+  String.substr_index_all ~may_overlap:false ~pattern:"\n\n" txt
+  |> fun l ->
+  ( match pos with
+  | Some pos -> List.filter l ~f:(fun x -> x > pos && followed_by_space x)
+  | None -> List.filter l ~f:followed_by_space )
+  |> List.hd
+
+let sub_start i (loc : Location.t) =
+  String.sub ~pos:(i + 2)
+    ~len:(loc.loc_end.pos_cnum - loc.loc_start.pos_cnum - i - 2)
+
+let sub_end i = String.sub ~pos:0 ~len:(i - 1)
+
+let find_start ?pos (lexed : Cmt_lexer.token array) ~start_i
+    ~split_on_semisemi =
+  let rec aux ?pos i =
+    if i < 0 then
+      match Array.unsafe_get lexed 0 with
+      | Cmt {txt; _} -> (0, txt, 0, 0)
+      | S {txt; _} -> (0, txt, 0, 0)
+    else
+      let lex = Array.unsafe_get lexed i in
+      match lex with
+      | Cmt _ -> aux (i - 1)
+      | S {txt; loc} -> (
+          let ldiff max_cnum =
+            let rec aux lnum i =
+              if i >= max_cnum then lnum
+              else
+                match String.unsafe_get txt i with
+                | '\n' -> aux (lnum + 1) (i + 1)
+                | _ -> aux lnum (i + 1)
+            in
+            aux loc.loc_start.pos_lnum 0
+          in
+          if split_on_semisemi then
+            match find_semisemi_start ?pos txt with
+            | Some x -> (
+              match find_nl_start ?pos txt with
+              | Some y ->
+                  let x = max x y in
+                  let cnum = x + 2 in
+                  let ldiff = ldiff cnum in
+                  (i, sub_start x loc txt, cnum, ldiff)
+              | None ->
+                  let cnum = x + 2 in
+                  let ldiff = ldiff cnum in
+                  (i, sub_start x loc txt, cnum, ldiff) )
+            | None -> (
+              match find_nl_start ?pos txt with
+              | Some x ->
+                  let cnum = x + 2 in
+                  let ldiff = ldiff cnum in
+                  (i, sub_start x loc txt, cnum, ldiff)
+              | None -> aux (i - 1) )
+          else
+            match find_nl_start ?pos txt with
+            | Some x ->
+                let cnum = x + 2 in
+                let ldiff = ldiff cnum in
+                (i, sub_start x loc txt, cnum, ldiff)
+            | None -> aux (i - 1) )
+  in
+  aux ?pos start_i
+
+let find_end ?pos (lexed : Cmt_lexer.token array) ~end_i ~split_on_semisemi =
+  let max_i = Array.length lexed - 1 in
+  let rec aux ?pos i =
+    if i > max_i then
+      match Array.unsafe_get lexed max_i with
+      | Cmt {txt; _} -> (max_i, txt, String.length txt - 1)
+      | S {txt; _} -> (max_i, txt, String.length txt - 1)
+    else
+      let lex = Array.unsafe_get lexed i in
+      match lex with
+      | Cmt _ -> aux (i + 1)
+      | S {txt; loc= _} -> (
+          if split_on_semisemi then
+            match find_semisemi_end ?pos txt with
+            | Some x -> (
+              match find_nl_end ?pos txt with
+              | Some y ->
+                  let x = min x y in
+                  (i, sub_end x txt, x - 1)
+              | None -> (i, sub_end x txt, x - 1) )
+            | None -> (
+              match find_nl_end ?pos txt with
+              | Some x -> (i, sub_end x txt, x - 1)
+              | None -> aux (i + 1) )
+          else
+            match find_nl_end ?pos txt with
+            | Some x -> (i, sub_end x txt, x - 1)
+            | None -> aux (i + 1) )
+  in
+  aux ?pos end_i
+
+let split ~range:((low, high) as range) ~split_on_semisemi = function
+  | [] -> ("", range)
+  | lexed ->
+      let lexed : Cmt_lexer.token array = Array.of_list lexed in
+      let high =
+        match Array.unsafe_get lexed (Array.length lexed - 1) with
+        | Cmt {loc; _} | S {loc; _} ->
+            if high - 1 = loc.loc_end.pos_lnum + 1 then high - 1 else high
+      in
+      let range = (low, high) in
+      let start_i, start_lex =
+        Array.findi_exn lexed ~f:(fun _ -> function
+          | Cmt {loc; _} | S {loc; _} ->
+              loc.loc_start.pos_lnum <= low - 1
+              && low - 1 <= loc.loc_end.pos_lnum )
+      in
+      let end_i, end_lex =
+        Array.findi_exn lexed ~f:(fun _ -> function
+          | Cmt {loc; _} | S {loc; _} ->
+              loc.loc_start.pos_lnum <= high - 1
+              && high - 1 <= loc.loc_end.pos_lnum )
+      in
+      let start_i, start_lex, start_pos, ldiff =
+        match start_lex with
+        | S {txt; loc} ->
+            let rec aux ~lnum i =
+              if lnum = low - 1 then i
+              else
+                match String.unsafe_get txt i with
+                | '\n' -> aux ~lnum:(lnum + 1) (i + 1)
+                | _ -> aux ~lnum (i + 1)
+            in
+            let pos = aux 0 ~lnum:loc.loc_start.pos_lnum in
+            find_start ~pos ~start_i ~split_on_semisemi lexed
+        | Cmt _ -> find_start ~start_i ~split_on_semisemi lexed
+      in
+      let end_i, end_lex, end_pos =
+        match end_lex with
+        | S {txt; loc} ->
+            let rec aux ~lnum i =
+              if lnum = high - 1 then i
+              else
+                match String.unsafe_get txt i with
+                | '\n' -> aux ~lnum:(lnum + 1) (i + 1)
+                | _ -> aux ~lnum (i + 1)
+            in
+            let pos = aux 0 ~lnum:loc.loc_start.pos_lnum in
+            find_end ~pos ~end_i ~split_on_semisemi lexed
+        | Cmt _ -> find_end ~end_i ~split_on_semisemi lexed
+      in
+      if start_i = end_i then
+        match Array.unsafe_get lexed start_i with
+        | Cmt {txt; _} -> (txt, range)
+        | S {txt; _} ->
+            ( String.sub txt ~pos:start_pos ~len:(end_pos - start_pos + 1)
+            , (low - ldiff, high - ldiff) )
+      else
+        let rec aux acc i =
+          if i = end_i then acc ^ end_lex
+          else
+            match Array.unsafe_get lexed i with
+            | Cmt {txt; _} -> aux (acc ^ txt) (i + 1)
+            | S {txt; _} -> aux (acc ^ txt) (i + 1)
+        in
+        (aux start_lex (start_i + 1), (low - ldiff, high - ldiff))
+
+let fragment (type a) (fg : a Extended_ast.t) ~range input =
+  let split_on_semisemi =
+    match fg with
+    | Structure | Use_file -> true
+    | Signature -> false
+    | Core_type -> failwith "Slicer.fragment not implemented for Core_type"
+    | Module_type ->
+        failwith "Slicer.fragment not implemented for Module_type"
+    | Expression -> failwith "Slicer.fragment not implemented for Expression"
+  in
+  Cmt_lexer.lex_comments input |> split ~range ~split_on_semisemi
diff --git a/lib/Slicer.mli b/lib/Slicer.mli
@@ -0,0 +1,23 @@
+(**************************************************************************)
+(*                                                                        *)
+(*                              OCamlFormat                               *)
+(*                                                                        *)
+(*            Copyright (c) Facebook, Inc. and its affiliates.            *)
+(*                                                                        *)
+(*      This source code is licensed under the MIT license found in       *)
+(*      the LICENSE file in the root directory of this source tree.       *)
+(*                                                                        *)
+(**************************************************************************)
+
+val split :
+     range:int * int
+  -> split_on_semisemi:bool
+  -> Cmt_lexer.token list
+  -> string * (int * int)
+(** Exposed for tests. *)
+
+val fragment :
+  'a Extended_ast.t -> range:int * int -> string -> string * (int * int)
+(** [fragment fg ~range x] slices [x] into a smaller string, filtered
+    according to the range of lines [range], and containing the relevant
+    context so we can infer the indentation to apply to this range of lines. *)