From 11fd74e17f4ef619ca05fae23925bc90b7560d31 Mon Sep 17 00:00:00 2001 From: Chris Penner Date: Wed, 8 May 2024 11:24:23 -0700 Subject: [PATCH 01/50] Add FromJSON for Display objects --- unison-share-api/src/Unison/Server/Orphans.hs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unison-share-api/src/Unison/Server/Orphans.hs b/unison-share-api/src/Unison/Server/Orphans.hs index 8aeadc269b..8775c376cd 100644 --- a/unison-share-api/src/Unison/Server/Orphans.hs +++ b/unison-share-api/src/Unison/Server/Orphans.hs @@ -155,6 +155,15 @@ instance (ToJSON b, ToJSON a) => ToJSON (DisplayObject b a) where MissingObject sh -> object ["tag" Aeson..= String "MissingObject", "contents" Aeson..= sh] UserObject a -> object ["tag" Aeson..= String "UserObject", "contents" Aeson..= a] +instance (FromJSON a, FromJSON b) => FromJSON (DisplayObject b a) where + parseJSON = withObject "DisplayObject" \o -> do + tag <- o .: "tag" + case tag of + "BuiltinObject" -> BuiltinObject <$> o .: "contents" + "MissingObject" -> MissingObject <$> o .: "contents" + "UserObject" -> UserObject <$> o .: "contents" + _ -> fail $ "Invalid tag: " <> Text.unpack tag + deriving instance (ToSchema b, ToSchema a) => ToSchema (DisplayObject b a) -- [21/10/07] Hello, this is Mitchell. Name refactor in progress. Changing internal representation from a flat text to a From cdab05d25992537ea1ce699a1dbe2d4413810b41 Mon Sep 17 00:00:00 2001 From: ChrisPenner Date: Wed, 17 Jul 2024 18:29:29 +0000 Subject: [PATCH 02/50] automatically run ormolu --- unison-syntax/src/Unison/Syntax/HashQualifiedPrime.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unison-syntax/src/Unison/Syntax/HashQualifiedPrime.hs b/unison-syntax/src/Unison/Syntax/HashQualifiedPrime.hs index 6326006c7a..406a8eae2f 100644 --- a/unison-syntax/src/Unison/Syntax/HashQualifiedPrime.hs +++ b/unison-syntax/src/Unison/Syntax/HashQualifiedPrime.hs @@ -48,7 +48,7 @@ toText = -- | A hash-qualified parser. hashQualifiedP :: - Monad m => + (Monad m) => ParsecT (Token Text) [Char] m name -> ParsecT (Token Text) [Char] m (HQ'.HashQualified name) hashQualifiedP nameP = From b6b31370021099724c7230ac89a84e46155d1f07 Mon Sep 17 00:00:00 2001 From: Eduard Nicodei Date: Sat, 13 Jul 2024 20:15:41 +0100 Subject: [PATCH 03/50] create gitignore in scheme-libs/racket/unison/ --- scheme-libs/racket/unison/.gitignore | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 scheme-libs/racket/unison/.gitignore diff --git a/scheme-libs/racket/unison/.gitignore b/scheme-libs/racket/unison/.gitignore new file mode 100644 index 0000000000..64e9064d19 --- /dev/null +++ b/scheme-libs/racket/unison/.gitignore @@ -0,0 +1,6 @@ +compiled/ +boot-generated.ss +builtin-generated.ss +compound-wrappers.ss +data-info.ss +simple-wrappers.ss From 3cd2a76d5e6744599232bc972128f9376ac0817d Mon Sep 17 00:00:00 2001 From: Eduard Nicodei Date: Sat, 13 Jul 2024 20:54:55 +0100 Subject: [PATCH 04/50] update scheme-libs/racket/unison/Readme.md --- development.markdown | 4 ++ scheme-libs/racket/unison/Readme.md | 64 +++++++++++++++++-- .../transcripts-manual/gen-racket-libs.md | 7 +- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/development.markdown b/development.markdown index 962a507c63..fa5d613b84 100644 --- a/development.markdown +++ b/development.markdown @@ -187,3 +187,7 @@ nix develop '.#cabal-unison-parser-typechecker' cd unison-cli cabal run --enable-profiling unison-cli-main:exe:unison -- +RTS -p ``` + +## Native compilation + +See the [readme](scheme-libs/racket/unison/Readme.md). diff --git a/scheme-libs/racket/unison/Readme.md b/scheme-libs/racket/unison/Readme.md index dafd7e3fa8..3984146df9 100644 --- a/scheme-libs/racket/unison/Readme.md +++ b/scheme-libs/racket/unison/Readme.md @@ -1,5 +1,18 @@ This directory contains libraries necessary for building and running -unison programs via Racket Scheme. +unison programs via Racket Scheme. The rough steps are as follows: + +* Build Racket libraries from the current Unison version. +* Build the `unison-runtime` binary. +* Pass the path to `unison-runtime` to `ucm`. + +Native compilation is done via the `compile.native` `ucm` command. +Under-the-hood, Unison does the following: + +* Convert the function to bytecode (similar to how `compile` command works). +* Call `unison-runtime` which will convert the bytecode to a temporary Racket + file. The Racket file is usually placed in your `.cache/unisonlanguage`. +* folder. Call `raco exe file.rkt -o executable` which will create a native + executable from the Racket source code. ## Prerequisites @@ -10,19 +23,56 @@ You'll need to have a couple things installed on your system: * [BLAKE2](https://github.com/BLAKE2/libb2) (you may need to install this manually) -In particular, our crypto functions require on both `libcrypto` (from openssl) and `libb2`. You may have to tell racket where to find `libb2`, by adding an entry to the hash table in your [`config.rktd` file](https://docs.racket-lang.org/raco/config-file.html). This is what I had, for an M1 mac w/ libb2 installed via Homebrew: +In particular, our crypto functions require on both `libcrypto` (from +openssl) and `libb2`. You may have to tell racket where to find `libb2`, +by adding an entry to the hash table in your +[`config.rktd` file](https://docs.racket-lang.org/raco/config-file.html). +This is what I had, for an M1 mac w/ `libb2` installed via Homebrew: ``` -(lib-search-dirs . (#f "/opt/homebrew/Cellar/libb2/0.98.1/lib/")) +$ cat scheme-libs/racket/config/config.rktd +#hash( + (lib-search-dirs . (#f "/opt/homebrew/Cellar/libb2/0.98.1/lib/")) +) ``` You'll also need to install `x509-lib` with `raco pkg install x509-lib` +Finally, some distributions only package `racket-minimal`. You'll need to +install the full compiler suite using `raco pkg install compiler-lib` +([source](https://www.dbrunner.de/blog/2016/01/12/using-racket-minimal-and-raco/)) + +## Building + +First, make sure unison is built (see [development](../../../development.markdown)) + +Next, use unison to generate the racket libraries. These are dependencies for +building `unison-runtime`. +* Read [gen-racket-libs.md](../../../../unison-src-transcripts-manual/gen-racket-libs.md). + It will contain two things: + * `ucm` and `unison` transcripts that generate the libraries + * Instructions on how to build `unison-runtime` using `raco` + +If everything went well you should now have a new executable in `scheme-libs/racket/unison-runtime`. +For example: +``` +$ file scheme-libs/racket/unison-runtime +scheme-libs/racket/unison-runtime: Mach-O 64-bit executable arm64 +``` ## Running the unison test suite -To run the test suite, first `stack build` (or `stack build --fast`), then: +Note that if you set up `config.rktd` above, you'll need to pass the path to its +folder in `PLTCONFIGDIR` before invoking unison or the test scripts: + +``` +export PLTCONFIGDIR=$(pwd)/scheme-libs/racket/config +``` + +If you don't, some of the tests will fail with eg `ffi-lib: could not load foreign library`. + +To run the test suite you can do: ``` -./unison-src/builtin-tests/jit-tests.sh $(stack exec which unison) --runtime-path +./unison-src/builtin-tests/jit-tests.sh $(stack exec which unison) --runtime-path scheme-libs/racket/unison-runtime ``` OR if you want to run the same tests in interpreted mode: @@ -31,7 +81,9 @@ OR if you want to run the same tests in interpreted mode: ./unison-src/builtin-tests/interpreter-tests.sh ``` -The above scripts fetch and cache a copy of base and the scheme-generating libraries, and copy this directory to `$XDG_DATA_DIRECTORY/unisonlanguage/scheme-libs`. +The above scripts fetch and cache a copy of base and the scheme-generating +libraries, and copy this directory to `$XDG_DATA_DIRECTORY/unisonlanguage/scheme-libs`. +Both scripts _should_ pass. ## Iterating more quickly diff --git a/unison-src/transcripts-manual/gen-racket-libs.md b/unison-src/transcripts-manual/gen-racket-libs.md index 178503c969..311d056641 100644 --- a/unison-src/transcripts-manual/gen-racket-libs.md +++ b/unison-src/transcripts-manual/gen-racket-libs.md @@ -20,12 +20,11 @@ complement of unison libraries for a given combination of ucm version and @unison/internal version. To set up racket to use these files, we need to create a package with -them. This is accomplished by running. +them. This is accomplished by running: - raco pkg install -t dir unison + raco pkg install -t dir scheme-libs/racket/unison -in the directory where the `unison` directory is located. Then the -runtime executable can be built with +After, the runtime executable can be built with raco exe scheme-libs/racket/unison-runtime.rkt From 05362b87d065015080c8096508bc7182d02a07c1 Mon Sep 17 00:00:00 2001 From: Eduard Nicodei Date: Sat, 13 Jul 2024 10:11:24 +0100 Subject: [PATCH 05/50] update command for integration tests in docs --- development.markdown | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/development.markdown b/development.markdown index fa5d613b84..d6d3eb1df0 100644 --- a/development.markdown +++ b/development.markdown @@ -42,7 +42,7 @@ Some tests are executables instead: * `stack exec transcripts` runs the transcripts-related integration tests, found in `unison-src/transcripts`. You can add more tests to this directory. * `stack exec transcripts -- prefix-of-filename` runs only transcript tests with a matching filename prefix. -* `stack exec integration-tests` runs the additional integration tests for cli. These tests are not triggered by `tests` or `transcripts`. +* `stack exec cli-integration-tests` runs the additional integration tests for cli. These tests are not triggered by `tests` or `transcripts`. * `stack exec unison -- transcript unison-src/transcripts-round-trip/main.md` runs the pretty-printing round trip tests * `stack exec unison -- transcript unison-src/transcripts-manual/benchmarks.md` runs the benchmark suite. Output goes in unison-src/transcripts-manual/benchmarks/output.txt. From 55560e58ca4c5f3ed881633e7b20dd2d5d16327b Mon Sep 17 00:00:00 2001 From: Eduard Nicodei Date: Thu, 18 Jul 2024 00:29:03 +0100 Subject: [PATCH 06/50] update transcripts-manual/gen-racket-libs.output.md --- .../gen-racket-libs.output.md | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/unison-src/transcripts-manual/gen-racket-libs.output.md b/unison-src/transcripts-manual/gen-racket-libs.output.md index 1e003ab489..178d4b6f4e 100644 --- a/unison-src/transcripts-manual/gen-racket-libs.output.md +++ b/unison-src/transcripts-manual/gen-racket-libs.output.md @@ -1,22 +1,21 @@ - When we start out, `./scheme-libs/racket` contains a bunch of library files that we'll need. They define the Unison builtins for Racket. Next, we'll download the jit project and generate a few Racket files from it. -```ucm +``` ucm jit-setup/main> lib.install @unison/internal/releases/0.0.18 - Downloaded 14917 entities. + Downloaded 14949 entities. I installed @unison/internal/releases/0.0.18 as unison_internal_0_0_18. ``` -```unison +``` unison go = generateSchemeBoot "scheme-libs/racket" ``` -```ucm +``` ucm Loading changes detected in scratch.u. @@ -29,7 +28,7 @@ go = generateSchemeBoot "scheme-libs/racket" go : '{IO, Exception} () ``` -```ucm +``` ucm jit-setup/main> run go () @@ -42,16 +41,23 @@ and @unison/internal version. To set up racket to use these files, we need to create a package with them. This is accomplished by running. - raco pkg install -t dir unison +``` +raco pkg install -t dir unison +``` -in the directory where the `unison directory is located. Then the +in the directory where the `unison` directory is located. Then the runtime executable can be built with - raco exe scheme-libs/racket/unison-runtime.rkt +``` +raco exe scheme-libs/racket/unison-runtime.rkt +``` and a distributable directory can be produced with: - raco distribute scheme-libs/racket/unison-runtime +``` +raco distribute scheme-libs/racket/unison-runtime +``` At that point, should contain the executable and all dependencies necessary to run it. + From 7ed45f6cd77b3d724503e30ff1050052b5ed6269 Mon Sep 17 00:00:00 2001 From: Eduard Nicodei Date: Thu, 18 Jul 2024 00:44:22 +0100 Subject: [PATCH 07/50] minor fixups --- scheme-libs/racket/unison/Readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scheme-libs/racket/unison/Readme.md b/scheme-libs/racket/unison/Readme.md index 3984146df9..f84d73e57f 100644 --- a/scheme-libs/racket/unison/Readme.md +++ b/scheme-libs/racket/unison/Readme.md @@ -25,8 +25,8 @@ You'll need to have a couple things installed on your system: In particular, our crypto functions require on both `libcrypto` (from openssl) and `libb2`. You may have to tell racket where to find `libb2`, -by adding an entry to the hash table in your -[`config.rktd` file](https://docs.racket-lang.org/raco/config-file.html). +by adding an entry to the hash table in your `config.rktd` +[file](https://docs.racket-lang.org/raco/config-file.html). This is what I had, for an M1 mac w/ `libb2` installed via Homebrew: ``` $ cat scheme-libs/racket/config/config.rktd @@ -46,7 +46,7 @@ First, make sure unison is built (see [development](../../../development.markdow Next, use unison to generate the racket libraries. These are dependencies for building `unison-runtime`. -* Read [gen-racket-libs.md](../../../../unison-src-transcripts-manual/gen-racket-libs.md). +* Read [gen-racket-libs.md](../../../unison-src/transcripts-manual/gen-racket-libs.md). It will contain two things: * `ucm` and `unison` transcripts that generate the libraries * Instructions on how to build `unison-runtime` using `raco` From c657e589240c511394b77caf6df8a36c6dd8c6bb Mon Sep 17 00:00:00 2001 From: Eduard Nicodei Date: Thu, 18 Jul 2024 00:50:16 +0100 Subject: [PATCH 08/50] remove extra word and mention libressl --- scheme-libs/racket/unison/Readme.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scheme-libs/racket/unison/Readme.md b/scheme-libs/racket/unison/Readme.md index f84d73e57f..c8f7e76eb4 100644 --- a/scheme-libs/racket/unison/Readme.md +++ b/scheme-libs/racket/unison/Readme.md @@ -22,12 +22,11 @@ You'll need to have a couple things installed on your system: * [Racket](https://racket-lang.org/), with the executable `racket` on your path somewhere * [BLAKE2](https://github.com/BLAKE2/libb2) (you may need to install this manually) - -In particular, our crypto functions require on both `libcrypto` (from -openssl) and `libb2`. You may have to tell racket where to find `libb2`, -by adding an entry to the hash table in your `config.rktd` -[file](https://docs.racket-lang.org/raco/config-file.html). -This is what I had, for an M1 mac w/ `libb2` installed via Homebrew: +In particular, our crypto functions require both `libcrypto` (from openssl or +eg. libressl) and `libb2`. You may have to tell racket where to find `libb2`, by +adding an entry to the hash table in your +[`config.rktd` file](https://docs.racket-lang.org/raco/config-file.html). +This is what I had, for an M1 mac with `libb2` installed via Homebrew: ``` $ cat scheme-libs/racket/config/config.rktd #hash( From 032e3609a0426cd97be13df16b82d721913f3287 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 12:41:55 -0600 Subject: [PATCH 09/50] =?UTF-8?q?Add=20some=20transcripts=20that=20should?= =?UTF-8?q?=20error,=20but=20don=E2=80=99t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- unison-src/transcripts/errors/invalid-api-requests.md | 3 +++ unison-src/transcripts/errors/no-abspath-in-ucm.md | 5 +++++ 2 files changed, 8 insertions(+) create mode 100644 unison-src/transcripts/errors/invalid-api-requests.md create mode 100644 unison-src/transcripts/errors/no-abspath-in-ucm.md diff --git a/unison-src/transcripts/errors/invalid-api-requests.md b/unison-src/transcripts/errors/invalid-api-requests.md new file mode 100644 index 0000000000..12cfe78660 --- /dev/null +++ b/unison-src/transcripts/errors/invalid-api-requests.md @@ -0,0 +1,3 @@ +``` api:error +DELETE /something/important +``` diff --git a/unison-src/transcripts/errors/no-abspath-in-ucm.md b/unison-src/transcripts/errors/no-abspath-in-ucm.md new file mode 100644 index 0000000000..a982bb9855 --- /dev/null +++ b/unison-src/transcripts/errors/no-abspath-in-ucm.md @@ -0,0 +1,5 @@ +``` ucm:error +scratch/main> builtins.merge +-- As of 0.5.25, we no longer allow loose code paths for UCM commands. +.> ls +``` From bd4c2044ec770e82623104ac6690c3ffde911101 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 12:42:57 -0600 Subject: [PATCH 10/50] Ensure transcript parser consumes entire stanzas MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the switch to `cmark`, the “second phase” parsing of individual stanzas omitted an EOF check to ensure that the entire stanza had been parsed. This resulted in parses where we end up with truncated sets of UCM commands or API requests, which could either result in premature success or failures occurring later in the transcript, where they’d complain about the wrong thing. --- .../src/Unison/Codebase/Transcript/Parser.hs | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/unison-cli/src/Unison/Codebase/Transcript/Parser.hs b/unison-cli/src/Unison/Codebase/Transcript/Parser.hs index 8bbd8be622..47f7965240 100644 --- a/unison-cli/src/Unison/Codebase/Transcript/Parser.hs +++ b/unison-cli/src/Unison/Codebase/Transcript/Parser.hs @@ -24,11 +24,10 @@ where import CMark qualified import Data.Char qualified as Char import Data.Text qualified as Text -import Data.These (These (..)) import Text.Megaparsec qualified as P import Unison.Codebase.Transcript import Unison.Prelude -import Unison.Project (ProjectAndBranch (ProjectAndBranch)) +import Unison.Project (fullyQualifiedProjectAndBranchNamesParser) formatAPIRequest :: APIRequest -> Text formatAPIRequest = \case @@ -72,24 +71,16 @@ ucmLine :: P UcmLine ucmLine = ucmCommand <|> ucmComment where ucmCommand :: P UcmLine - ucmCommand = do - context <- - P.try do - contextString <- P.takeWhile1P Nothing (/= '>') - context <- - case (tryFrom @Text contextString) of - (Right (These project branch)) -> pure (UcmContextProject (ProjectAndBranch project branch)) - _ -> fail "expected project/branch or absolute path" - void $ lineToken $ word ">" - pure context - line <- P.takeWhileP Nothing (/= '\n') <* spaces - pure $ UcmCommand context line + ucmCommand = + UcmCommand + <$> fmap UcmContextProject (P.try $ fullyQualifiedProjectAndBranchNamesParser <* lineToken (word ">")) + <*> P.takeWhileP Nothing (/= '\n') + <* spaces ucmComment :: P UcmLine - ucmComment = do - word "--" - line <- P.takeWhileP Nothing (/= '\n') <* spaces - pure $ UcmComment line + ucmComment = + P.label "comment (delimited with “--”)" $ + UcmComment <$> (word "--" *> P.takeWhileP Nothing (/= '\n')) <* spaces apiRequest :: P APIRequest apiRequest = do @@ -118,7 +109,7 @@ fenced info = do hide <- hidden err <- expectingError P.setInput body - pure . Ucm hide err <$> (spaces *> many ucmLine) + pure . Ucm hide err <$> (spaces *> P.manyTill ucmLine P.eof) "unison" -> do -- todo: this has to be more interesting @@ -132,7 +123,7 @@ fenced info = do pure . Unison hide err fileName <$> (spaces *> P.getInput) "api" -> do P.setInput body - pure . API <$> (spaces *> many apiRequest) + pure . API <$> (spaces *> P.manyTill apiRequest P.eof) _ -> pure Nothing word :: Text -> P Text From 80143eb9a289710e72f520065b7d88b766da78d1 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 12:58:15 -0600 Subject: [PATCH 11/50] Add transcripts for some already-fixed issues This fixes #1327 (which was actually fixed by #5056 or #5061) and #3977 was already closed, but without the transcript to avoid regression. --- unison-src/transcripts/fix1327.md | 11 ++++++ unison-src/transcripts/fix1327.output.md | 46 ++++++++++++++++++++++++ unison-src/transcripts/fix3977.md | 15 ++++++++ unison-src/transcripts/fix3977.output.md | 42 ++++++++++++++++++++++ 4 files changed, 114 insertions(+) create mode 100644 unison-src/transcripts/fix1327.md create mode 100644 unison-src/transcripts/fix1327.output.md create mode 100644 unison-src/transcripts/fix3977.md create mode 100644 unison-src/transcripts/fix3977.output.md diff --git a/unison-src/transcripts/fix1327.md b/unison-src/transcripts/fix1327.md new file mode 100644 index 0000000000..764d0f3ac5 --- /dev/null +++ b/unison-src/transcripts/fix1327.md @@ -0,0 +1,11 @@ +```unison +foo = 4 + +bar = 5 +``` + +```ucm +scratch/main> add +scratch/main> ls +scratch/main> alias.many 1-2 .ns1_nohistory +``` diff --git a/unison-src/transcripts/fix1327.output.md b/unison-src/transcripts/fix1327.output.md new file mode 100644 index 0000000000..fa542e6ed2 --- /dev/null +++ b/unison-src/transcripts/fix1327.output.md @@ -0,0 +1,46 @@ +``` unison +foo = 4 + +bar = 5 +``` + +``` ucm + + Loading changes detected in scratch.u. + + I found and typechecked these definitions in scratch.u. If you + do an `add` or `update`, here's how your codebase would + change: + + ⍟ These new definitions are ok to `add`: + + bar : ##Nat + foo : ##Nat + +``` +``` ucm +scratch/main> add + + ⍟ I've added these definitions: + + bar : ##Nat + foo : ##Nat + +scratch/main> ls + + 1. bar (##Nat) + 2. foo (##Nat) + +scratch/main> alias.many 1-2 .ns1_nohistory + + Here's what changed in .ns1_nohistory : + + Added definitions: + + 1. bar : ##Nat + 2. foo : ##Nat + + Tip: You can use `undo` or use a hash from `reflog` to undo + this change. + +``` diff --git a/unison-src/transcripts/fix3977.md b/unison-src/transcripts/fix3977.md new file mode 100644 index 0000000000..8ad82cbce9 --- /dev/null +++ b/unison-src/transcripts/fix3977.md @@ -0,0 +1,15 @@ +```ucm:hide +scratch/main> builtins.merge +``` + +```unison:hide +failure msg context = Failure (typeLink Unit) msg (Any context) + +foo = Left (failure ("a loooooooooooooooooooooooooooooooooong" ++ "message with concatenation") ()) +``` + +```ucm +scratch/main> add +scratch/main> edit foo +scratch/main> load scratch.u +``` diff --git a/unison-src/transcripts/fix3977.output.md b/unison-src/transcripts/fix3977.output.md new file mode 100644 index 0000000000..79a68eedc4 --- /dev/null +++ b/unison-src/transcripts/fix3977.output.md @@ -0,0 +1,42 @@ +``` unison +failure msg context = Failure (typeLink Unit) msg (Any context) + +foo = Left (failure ("a loooooooooooooooooooooooooooooooooong" ++ "message with concatenation") ()) +``` + +``` ucm +scratch/main> add + + ⍟ I've added these definitions: + + failure : Text -> context -> Failure + foo : Either Failure b + +scratch/main> edit foo + + ☝️ + + I added 1 definitions to the top of scratch.u + + You can edit them there, then run `update` to replace the + definitions currently in this namespace. + +scratch/main> load scratch.u + + Loading changes detected in scratch.u. + + I found and typechecked the definitions in scratch.u. This + file has been previously added to the codebase. + +``` +``` unison:added-by-ucm scratch.u +foo : Either Failure b +foo = + use Text ++ + Left + (failure + ("a loooooooooooooooooooooooooooooooooong" + ++ "message with concatenation") + ()) +``` + From 21209e2bdd48a16cca4d74d000d2f0494c78d3f5 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 1 Jul 2024 13:17:38 -0600 Subject: [PATCH 12/50] Extract the `Doc` lexer into a top-level function --- unison-syntax/src/Unison/Syntax/Lexer.hs | 857 ++++++++++++----------- 1 file changed, 429 insertions(+), 428 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 14fe31f9a7..144ccd95c3 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -400,6 +400,435 @@ restoreStack lbl p = do S.put (s2 {layout = layout1}) pure $ p <> closes +-- | The `Doc` lexer as documented on unison-lang.org +doc2 :: P [Token Lexeme] +doc2 = do + -- Ensure we're at a doc before we start consuming tokens + P.lookAhead (lit "{{") + openStart <- posP + -- Produce any layout tokens, such as closing the last open block or virtual semicolons + -- We don't use 'token' on "{{" directly because we don't want to duplicate layout + -- tokens if we do the rewrite hack for type-docs below. + beforeStartToks <- token' ignore (pure ()) + void $ lit "{{" + openEnd <- posP + CP.space + -- Construct the token for opening the doc block. + let openTok = Token (Open "syntax.docUntitledSection") openStart openEnd + env0 <- S.get + -- Disable layout while parsing the doc block and reset the section number + (bodyToks0, closeTok) <- local + ( \env -> + env + { inLayout = False, + parentSections = 0 : (parentSections env0) + } + ) + do + bodyToks <- body + closeStart <- posP + lit "}}" + closeEnd <- posP + pure (bodyToks, Token Close closeStart closeEnd) + let docToks = beforeStartToks <> [openTok] <> bodyToks0 <> [closeTok] + -- Parse any layout tokens after the doc block, e.g. virtual semicolon + endToks <- token' ignore (pure ()) + -- Hack to allow anonymous doc blocks before type decls + -- {{ Some docs }} Foo.doc = {{ Some docs }} + -- ability Foo where => ability Foo where + tn <- subsequentTypeName + pure $ case (tn) of + -- If we're followed by a type, we rewrite the doc block to be a named doc block. + (Just (WordyId tname)) + | isTopLevel -> + beforeStartToks + <> [WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment)) <$ openTok, Open "=" <$ openTok] + <> [openTok] + <> bodyToks0 + <> [closeTok] + -- We need an extra 'Close' here because we added an extra Open above. + <> [closeTok] + <> endToks + where + isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 + _ -> docToks <> endToks + where + wordyKw kw = separated wordySep (lit kw) + subsequentTypeName = P.lookAhead . P.optional $ do + let lit' s = lit s <* sp + let modifier = typeModifiersAlt (lit' . Text.unpack) + let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) + _ <- optional modifier *> typeOrAbility' *> sp + Token name start stop <- tokenP identifierP + if Name.isSymboly (HQ'.toName name) + then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) + else pure (WordyId name) + ignore _ _ _ = [] + body = join <$> P.many (sectionElem <* CP.space) + sectionElem = section <|> fencedBlock <|> list <|> paragraph + paragraph = wrap "syntax.docParagraph" $ join <$> spaced leaf + reserved word = List.isPrefixOf "}}" word || all (== '#') word + + wordy closing = wrap "syntax.docWord" . tok . fmap Textual . P.try $ do + let end = + P.lookAhead $ + void docClose + <|> void docOpen + <|> void (P.satisfy isSpace) + <|> void closing + word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end + guard (not $ reserved word || null word) + pure word + + leafy closing = groupy closing gs + where + gs = + link + <|> externalLink + <|> exampleInline + <|> expr + <|> boldOrItalicOrStrikethrough closing + <|> verbatim + <|> atDoc + <|> wordy closing + + leaf = leafy mzero + + atDoc = src <|> evalInline <|> signature <|> signatureInline + where + comma = lit "," <* CP.space + src = + src' "syntax.docSource" "@source" + <|> src' "syntax.docFoldedSource" "@foldedSource" + srcElem = + wrap "syntax.docSourceElement" $ + (typeLink <|> termLink) + <+> ( fmap (fromMaybe []) . P.optional $ + (tok (Reserved <$> lit "@") <+> (CP.space *> annotations)) + ) + where + annotation = tok identifierLexemeP <|> expr <* CP.space + annotations = + join <$> P.some (wrap "syntax.docEmbedAnnotation" annotation) + src' name atName = wrap name $ do + _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space + s <- P.sepBy1 srcElem comma + _ <- lit "}" + pure (join s) + signature = wrap "syntax.docSignature" $ do + _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space + s <- join <$> P.sepBy1 signatureLink comma + _ <- lit "}" + pure s + signatureInline = wrap "syntax.docSignatureInline" $ do + _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space + s <- signatureLink + _ <- lit "}" + pure s + evalInline = wrap "syntax.docEvalInline" $ do + _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space + let inlineEvalClose = [] <$ lit "}" + s <- lexemes' inlineEvalClose + pure s + + typeLink = wrap "syntax.docEmbedTypeLink" do + _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space + tok identifierLexemeP <* CP.space + + termLink = + wrap "syntax.docEmbedTermLink" $ + tok identifierLexemeP <* CP.space + + signatureLink = + wrap "syntax.docEmbedSignatureLink" $ + tok identifierLexemeP <* CP.space + + groupy closing p = do + Token p start stop <- tokenP p + after <- P.optional . P.try $ leafy closing + pure $ case after of + Nothing -> p + Just after -> + [ Token (Open "syntax.docGroup") start stop', + Token (Open "syntax.docJoin") start stop' + ] + <> p + <> after + <> (take 2 $ repeat (Token Close stop' stop')) + where + stop' = maybe stop end (lastMay after) + + verbatim = + P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do + Token originalText start stop <- tokenP do + -- a single backtick followed by a non-backtick is treated as monospaced + let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) + -- also two or more ' followed by that number of closing ' + quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) + P.someTill P.anySingle (lit quotes) + let isMultiLine = line start /= line stop + if isMultiLine + then do + let trimmed = (trimAroundDelimiters originalText) + let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' + wrap "syntax.docVerbatim" $ + wrap "syntax.docWord" $ + pure [Token (Textual txt) start stop] + else + wrap "syntax.docCode" $ + wrap "syntax.docWord" $ + pure [Token (Textual originalText) start stop] + + exampleInline = + P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ + wrap "syntax.docExample" $ do + n <- P.try $ do + _ <- lit "`" + length <$> P.takeWhile1P (Just "backticks") (== '`') + let end :: P [Token Lexeme] = [] <$ lit (replicate (n + 1) '`') + ex <- CP.space *> lexemes' end + pure ex + + docClose = [] <$ lit "}}" + docOpen = [] <$ lit "{{" + + link = + P.label "link (examples: {type List}, {Nat.+})" $ + wrap "syntax.docLink" $ + P.try $ + lit "{" *> (typeLink <|> termLink) <* lit "}" + + expr = + P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + openAs "{{" "syntax.docTransclude" + <+> do + env0 <- S.get + -- we re-allow layout within a transclusion, then restore it to its + -- previous state after + S.put (env0 {inLayout = True}) + -- Note: this P.lookAhead ensures the }} isn't consumed, + -- so it can be consumed below by the `close` which will + -- pop items off the layout stack up to the nearest enclosing + -- syntax.docTransclude. + ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) + S.modify (\env -> env {inLayout = inLayout env0}) + pure ts + <+> close ["syntax.docTransclude"] (lit "}}") + + nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' + nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace + + -- Allows whitespace or a newline, but not more than two newlines in a row. + whitespaceWithoutParagraphBreak :: P () + whitespaceWithoutParagraphBreak = void do + void nonNewlineSpaces + optional newline >>= \case + Just _ -> void nonNewlineSpaces + Nothing -> pure () + + fencedBlock = + P.label "block eval (syntax: a fenced code block)" $ + evalUnison <|> exampleBlock <|> other + where + evalUnison = wrap "syntax.docEval" $ do + -- commit after seeing that ``` is on its own line + fence <- P.try $ do + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) + fence <$ guard b + CP.space + *> local + (\env -> env {inLayout = True, opening = Just "docEval"}) + (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) + + exampleBlock = wrap "syntax.docExampleBlock" $ do + void $ lit "@typecheck" <* CP.space + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + local + (\env -> env {inLayout = True, opening = Just "docExampleBlock"}) + (restoreStack "docExampleBlock" $ lexemes' ([] <$ lit fence)) + + uncolumn column tabWidth s = + let skip col r | col < 1 = r + skip col s@('\t' : _) | col < tabWidth = s + skip col ('\t' : r) = skip (col - tabWidth) r + skip col (c : r) + | isSpace c && (not $ isControl c) = + skip (col - 1) r + skip _ s = s + in List.intercalate "\n" $ skip column <$> lines s + + other = wrap "syntax.docCodeBlock" $ do + column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel + let tabWidth = toInteger . P.unPos $ P.defaultTabWidth + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + name <- + P.takeWhileP Nothing nonNewlineSpace + *> tok (Textual <$> P.takeWhile1P Nothing (not . isSpace)) + <* P.takeWhileP Nothing nonNewlineSpace + _ <- void CP.eol + verbatim <- + tok $ + Textual . uncolumn column tabWidth . trimAroundDelimiters + <$> P.someTill P.anySingle ([] <$ lit fence) + pure (name <> verbatim) + + boldOrItalicOrStrikethrough closing = do + let start = + some (P.satisfy (== '*')) + <|> some (P.satisfy (== '_')) + <|> some + (P.satisfy (== '~')) + name s = + if take 1 s == "~" + then "syntax.docStrikethrough" + else if take 1 s == "*" then "syntax.docBold" else "syntax.docItalic" + end <- P.try $ do + end <- start + P.lookAhead (P.satisfy (not . isSpace)) + pure end + wrap (name end) . wrap "syntax.docParagraph" $ + join + <$> P.someTill + (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) + (lit end) + + externalLink = + P.label "hyperlink (example: [link name](https://destination.com))" $ + wrap "syntax.docNamedLink" $ do + _ <- lit "[" + p <- leafies (void $ char ']') + _ <- lit "]" + _ <- lit "(" + target <- + wrap "syntax.docGroup" . wrap "syntax.docJoin" $ + link <|> fmap join (P.some (expr <|> wordy (char ')'))) + _ <- lit ")" + pure (p <> target) + + -- newline = P.optional (lit "\r") *> lit "\n" + + sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + + spaced p = P.some (p <* P.optional sp) + leafies close = wrap "syntax.docParagraph" $ join <$> spaced (leafy close) + + list = bulletedList <|> numberedList + + bulletedList = wrap "syntax.docBulletedList" $ join <$> P.sepBy1 bullet listSep + numberedList = wrap "syntax.docNumberedList" $ join <$> P.sepBy1 numberedItem listSep + + listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (bulletedStart <|> numberedStart) + + bulletedStart = P.try $ do + r <- listItemStart' $ [] <$ P.satisfy bulletChar + P.lookAhead (P.satisfy isSpace) + pure r + where + bulletChar ch = ch == '*' || ch == '-' || ch == '+' + + listItemStart' gutter = P.try $ do + nonNewlineSpaces + col <- column <$> posP + parentCol <- S.gets parentListColumn + guard (col > parentCol) + (col,) <$> gutter + + numberedStart = + listItemStart' $ P.try (tok . fmap num $ LP.decimal <* lit ".") + where + num :: Word -> Lexeme + num n = Numeric (show n) + + listItemParagraph = wrap "syntax.docParagraph" $ do + col <- column <$> posP + join <$> P.some (leaf <* sep col) + where + -- Trickiness here to support hard line breaks inside of + -- a bulleted list, so for instance this parses as expected: + -- + -- * uno dos + -- tres quatro + -- * alice bob + -- carol dave eve + sep col = do + _ <- nonNewlineSpaces + _ <- + P.optional . P.try $ + newline + *> nonNewlineSpaces + *> do + col2 <- column <$> posP + guard $ col2 >= col + (P.notFollowedBy $ numberedStart <|> bulletedStart) + pure () + + numberedItem = P.label msg $ do + (col, s) <- numberedStart + pure s + <+> ( wrap "syntax.docColumn" $ do + p <- nonNewlineSpaces *> listItemParagraph + subList <- + local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) + pure (p <> fromMaybe [] subList) + ) + where + msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" + + bullet = wrap "syntax.docColumn" . P.label "bullet (examples: * item1, - item2)" $ do + (col, _) <- bulletedStart + p <- nonNewlineSpaces *> listItemParagraph + subList <- + local + (\e -> e {parentListColumn = col}) + (P.optional $ listSep *> list) + pure (p <> fromMaybe [] subList) + + newline = P.label "newline" $ lit "\n" <|> lit "\r\n" + + -- ## Section title + -- + -- A paragraph under this section. + -- Part of the same paragraph. Blanklines separate paragraphs. + -- + -- ### A subsection title + -- + -- A paragraph under this subsection. + + -- # A section title (not a subsection) + section :: P [Token Lexeme] + section = wrap "syntax.docSection" $ do + ns <- S.gets parentSections + hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp + title <- paragraph <* CP.space + let m = length hashes + head ns + body <- + local (\env -> env {parentSections = (m : (tail ns))}) $ + P.many (sectionElem <* CP.space) + pure $ title <> join body + + wrap :: String -> P [Token Lexeme] -> P [Token Lexeme] + wrap o p = do + start <- posP + lexemes <- p + pure $ go start lexemes + where + go start [] = [Token (Open o) start start, Token Close start start] + go start ts@(Token _ x _ : _) = + Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) + where + final = last ts + lexemes' :: P [Token Lexeme] -> P [Token Lexeme] lexemes' eof = P.optional space >> do @@ -418,434 +847,6 @@ lexemes' eof = <|> token identifierLexemeP <|> (asum . map token) [semi, textual, hash] - doc2 :: P [Token Lexeme] - doc2 = do - -- Ensure we're at a doc before we start consuming tokens - P.lookAhead (lit "{{") - openStart <- posP - -- Produce any layout tokens, such as closing the last open block or virtual semicolons - -- We don't use 'token' on "{{" directly because we don't want to duplicate layout - -- tokens if we do the rewrite hack for type-docs below. - beforeStartToks <- token' ignore (pure ()) - void $ lit "{{" - openEnd <- posP - CP.space - -- Construct the token for opening the doc block. - let openTok = Token (Open "syntax.docUntitledSection") openStart openEnd - env0 <- S.get - -- Disable layout while parsing the doc block and reset the section number - (bodyToks0, closeTok) <- local - ( \env -> - env - { inLayout = False, - parentSections = 0 : (parentSections env0) - } - ) - do - bodyToks <- body - closeStart <- posP - lit "}}" - closeEnd <- posP - pure (bodyToks, Token Close closeStart closeEnd) - let docToks = beforeStartToks <> [openTok] <> bodyToks0 <> [closeTok] - -- Parse any layout tokens after the doc block, e.g. virtual semicolon - endToks <- token' ignore (pure ()) - -- Hack to allow anonymous doc blocks before type decls - -- {{ Some docs }} Foo.doc = {{ Some docs }} - -- ability Foo where => ability Foo where - tn <- subsequentTypeName - pure $ case (tn) of - -- If we're followed by a type, we rewrite the doc block to be a named doc block. - (Just (WordyId tname)) - | isTopLevel -> - beforeStartToks - <> [WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment)) <$ openTok, Open "=" <$ openTok] - <> [openTok] - <> bodyToks0 - <> [closeTok] - -- We need an extra 'Close' here because we added an extra Open above. - <> [closeTok] - <> endToks - where - isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docToks <> endToks - where - wordyKw kw = separated wordySep (lit kw) - subsequentTypeName = P.lookAhead . P.optional $ do - let lit' s = lit s <* sp - let modifier = typeModifiersAlt (lit' . Text.unpack) - let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) - _ <- optional modifier *> typeOrAbility' *> sp - Token name start stop <- tokenP identifierP - if Name.isSymboly (HQ'.toName name) - then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) - else pure (WordyId name) - ignore _ _ _ = [] - body = join <$> P.many (sectionElem <* CP.space) - sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap "syntax.docParagraph" $ join <$> spaced leaf - reserved word = List.isPrefixOf "}}" word || all (== '#') word - - wordy closing = wrap "syntax.docWord" . tok . fmap Textual . P.try $ do - let end = - P.lookAhead $ - void docClose - <|> void docOpen - <|> void (P.satisfy isSpace) - <|> void closing - word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end - guard (not $ reserved word || null word) - pure word - - leafy closing = groupy closing gs - where - gs = - link - <|> externalLink - <|> exampleInline - <|> expr - <|> boldOrItalicOrStrikethrough closing - <|> verbatim - <|> atDoc - <|> wordy closing - - leaf = leafy mzero - - atDoc = src <|> evalInline <|> signature <|> signatureInline - where - comma = lit "," <* CP.space - src = - src' "syntax.docSource" "@source" - <|> src' "syntax.docFoldedSource" "@foldedSource" - srcElem = - wrap "syntax.docSourceElement" $ - (typeLink <|> termLink) - <+> ( fmap (fromMaybe []) . P.optional $ - (tok (Reserved <$> lit "@") <+> (CP.space *> annotations)) - ) - where - annotation = tok identifierLexemeP <|> expr <* CP.space - annotations = - join <$> P.some (wrap "syntax.docEmbedAnnotation" annotation) - src' name atName = wrap name $ do - _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- P.sepBy1 srcElem comma - _ <- lit "}" - pure (join s) - signature = wrap "syntax.docSignature" $ do - _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- join <$> P.sepBy1 signatureLink comma - _ <- lit "}" - pure s - signatureInline = wrap "syntax.docSignatureInline" $ do - _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- signatureLink - _ <- lit "}" - pure s - evalInline = wrap "syntax.docEvalInline" $ do - _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space - let inlineEvalClose = [] <$ lit "}" - s <- lexemes' inlineEvalClose - pure s - - typeLink = wrap "syntax.docEmbedTypeLink" do - _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tok identifierLexemeP <* CP.space - - termLink = - wrap "syntax.docEmbedTermLink" $ - tok identifierLexemeP <* CP.space - - signatureLink = - wrap "syntax.docEmbedSignatureLink" $ - tok identifierLexemeP <* CP.space - - groupy closing p = do - Token p start stop <- tokenP p - after <- P.optional . P.try $ leafy closing - pure $ case after of - Nothing -> p - Just after -> - [ Token (Open "syntax.docGroup") start stop', - Token (Open "syntax.docJoin") start stop' - ] - <> p - <> after - <> (take 2 $ repeat (Token Close stop' stop')) - where - stop' = maybe stop end (lastMay after) - - verbatim = - P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do - Token originalText start stop <- tokenP do - -- a single backtick followed by a non-backtick is treated as monospaced - let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) - -- also two or more ' followed by that number of closing ' - quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) - P.someTill P.anySingle (lit quotes) - let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - wrap "syntax.docVerbatim" $ - wrap "syntax.docWord" $ - pure [Token (Textual txt) start stop] - else - wrap "syntax.docCode" $ - wrap "syntax.docWord" $ - pure [Token (Textual originalText) start stop] - - exampleInline = - P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - wrap "syntax.docExample" $ do - n <- P.try $ do - _ <- lit "`" - length <$> P.takeWhile1P (Just "backticks") (== '`') - let end :: P [Token Lexeme] = [] <$ lit (replicate (n + 1) '`') - ex <- CP.space *> lexemes' end - pure ex - - docClose = [] <$ lit "}}" - docOpen = [] <$ lit "{{" - - link = - P.label "link (examples: {type List}, {Nat.+})" $ - wrap "syntax.docLink" $ - P.try $ - lit "{" *> (typeLink <|> termLink) <* lit "}" - - expr = - P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - openAs "{{" "syntax.docTransclude" - <+> do - env0 <- S.get - -- we re-allow layout within a transclusion, then restore it to its - -- previous state after - S.put (env0 {inLayout = True}) - -- Note: this P.lookAhead ensures the }} isn't consumed, - -- so it can be consumed below by the `close` which will - -- pop items off the layout stack up to the nearest enclosing - -- syntax.docTransclude. - ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) - S.modify (\env -> env {inLayout = inLayout env0}) - pure ts - <+> close ["syntax.docTransclude"] (lit "}}") - - nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' - nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace - - -- Allows whitespace or a newline, but not more than two newlines in a row. - whitespaceWithoutParagraphBreak :: P () - whitespaceWithoutParagraphBreak = void do - void nonNewlineSpaces - optional newline >>= \case - Just _ -> void nonNewlineSpaces - Nothing -> pure () - - fencedBlock = - P.label "block eval (syntax: a fenced code block)" $ - evalUnison <|> exampleBlock <|> other - where - evalUnison = wrap "syntax.docEval" $ do - -- commit after seeing that ``` is on its own line - fence <- P.try $ do - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) - fence <$ guard b - CP.space - *> local - (\env -> env {inLayout = True, opening = Just "docEval"}) - (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) - - exampleBlock = wrap "syntax.docExampleBlock" $ do - void $ lit "@typecheck" <* CP.space - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - local - (\env -> env {inLayout = True, opening = Just "docExampleBlock"}) - (restoreStack "docExampleBlock" $ lexemes' ([] <$ lit fence)) - - uncolumn column tabWidth s = - let skip col r | col < 1 = r - skip col s@('\t' : _) | col < tabWidth = s - skip col ('\t' : r) = skip (col - tabWidth) r - skip col (c : r) - | isSpace c && (not $ isControl c) = - skip (col - 1) r - skip _ s = s - in List.intercalate "\n" $ skip column <$> lines s - - other = wrap "syntax.docCodeBlock" $ do - column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel - let tabWidth = toInteger . P.unPos $ P.defaultTabWidth - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - name <- - P.takeWhileP Nothing nonNewlineSpace - *> tok (Textual <$> P.takeWhile1P Nothing (not . isSpace)) - <* P.takeWhileP Nothing nonNewlineSpace - _ <- void CP.eol - verbatim <- - tok $ - Textual . uncolumn column tabWidth . trimAroundDelimiters - <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name <> verbatim) - - boldOrItalicOrStrikethrough closing = do - let start = - some (P.satisfy (== '*')) - <|> some (P.satisfy (== '_')) - <|> some - (P.satisfy (== '~')) - name s = - if take 1 s == "~" - then "syntax.docStrikethrough" - else if take 1 s == "*" then "syntax.docBold" else "syntax.docItalic" - end <- P.try $ do - end <- start - P.lookAhead (P.satisfy (not . isSpace)) - pure end - wrap (name end) . wrap "syntax.docParagraph" $ - join - <$> P.someTill - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) - - externalLink = - P.label "hyperlink (example: [link name](https://destination.com))" $ - wrap "syntax.docNamedLink" $ do - _ <- lit "[" - p <- leafies (void $ char ']') - _ <- lit "]" - _ <- lit "(" - target <- - wrap "syntax.docGroup" . wrap "syntax.docJoin" $ - link <|> fmap join (P.some (expr <|> wordy (char ')'))) - _ <- lit ")" - pure (p <> target) - - -- newline = P.optional (lit "\r") *> lit "\n" - - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 - - spaced p = P.some (p <* P.optional sp) - leafies close = wrap "syntax.docParagraph" $ join <$> spaced (leafy close) - - list = bulletedList <|> numberedList - - bulletedList = wrap "syntax.docBulletedList" $ join <$> P.sepBy1 bullet listSep - numberedList = wrap "syntax.docNumberedList" $ join <$> P.sepBy1 numberedItem listSep - - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (bulletedStart <|> numberedStart) - - bulletedStart = P.try $ do - r <- listItemStart' $ [] <$ P.satisfy bulletChar - P.lookAhead (P.satisfy isSpace) - pure r - where - bulletChar ch = ch == '*' || ch == '-' || ch == '+' - - listItemStart' gutter = P.try $ do - nonNewlineSpaces - col <- column <$> posP - parentCol <- S.gets parentListColumn - guard (col > parentCol) - (col,) <$> gutter - - numberedStart = - listItemStart' $ P.try (tok . fmap num $ LP.decimal <* lit ".") - where - num :: Word -> Lexeme - num n = Numeric (show n) - - listItemParagraph = wrap "syntax.docParagraph" $ do - col <- column <$> posP - join <$> P.some (leaf <* sep col) - where - -- Trickiness here to support hard line breaks inside of - -- a bulleted list, so for instance this parses as expected: - -- - -- * uno dos - -- tres quatro - -- * alice bob - -- carol dave eve - sep col = do - _ <- nonNewlineSpaces - _ <- - P.optional . P.try $ - newline - *> nonNewlineSpaces - *> do - col2 <- column <$> posP - guard $ col2 >= col - (P.notFollowedBy $ numberedStart <|> bulletedStart) - pure () - - numberedItem = P.label msg $ do - (col, s) <- numberedStart - pure s - <+> ( wrap "syntax.docColumn" $ do - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) - ) - where - msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" - - bullet = wrap "syntax.docColumn" . P.label "bullet (examples: * item1, - item2)" $ do - (col, _) <- bulletedStart - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local - (\e -> e {parentListColumn = col}) - (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) - - newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - - -- ## Section title - -- - -- A paragraph under this section. - -- Part of the same paragraph. Blanklines separate paragraphs. - -- - -- ### A subsection title - -- - -- A paragraph under this subsection. - - -- # A section title (not a subsection) - section :: P [Token Lexeme] - section = wrap "syntax.docSection" $ do - ns <- S.gets parentSections - hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp - title <- paragraph <* CP.space - let m = length hashes + head ns - body <- - local (\env -> env {parentSections = (m : (tail ns))}) $ - P.many (sectionElem <* CP.space) - pure $ title <> join body - - wrap :: String -> P [Token Lexeme] -> P [Token Lexeme] - wrap o p = do - start <- posP - lexemes <- p - pure $ go start lexemes - where - go start [] = [Token (Open o) start start, Token Close start start] - go start ts@(Token _ x _ : _) = - Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) - where - final = last ts - doc :: P [Token Lexeme] doc = open <+> (CP.space *> fmap fixup body) <+> (close <* space) where From d1fe6d9429ac5ed69b101fde6bbc353195f2edf5 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 1 Jul 2024 13:43:06 -0600 Subject: [PATCH 13/50] Separate the `Doc` lexer from the Unison lexer `doc2` is a Unison lexer that traverses a `Doc`. `docBody` is the actual `Doc` lexer that is ignorant of the fact that Unison wraps `Doc` blocks in `{{`/`}}`. --- unison-syntax/src/Unison/Syntax/Lexer.hs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 144ccd95c3..77c91b8e84 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -425,7 +425,7 @@ doc2 = do } ) do - bodyToks <- body + bodyToks <- docBody closeStart <- posP lit "}}" closeEnd <- posP @@ -453,6 +453,7 @@ doc2 = do isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 _ -> docToks <> endToks where + -- DUPLICATED wordyKw kw = separated wordySep (lit kw) subsequentTypeName = P.lookAhead . P.optional $ do let lit' s = lit s <* sp @@ -464,7 +465,23 @@ doc2 = do then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) else pure (WordyId name) ignore _ _ _ = [] - body = join <$> P.many (sectionElem <* CP.space) + -- DUPLICATED + sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + +-- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that +-- Unison wraps `Doc` literals in `}}`). +docBody :: P [Token Lexeme] +docBody = join <$> P.many (sectionElem <* CP.space) + where + wordyKw kw = separated wordySep (lit kw) sectionElem = section <|> fencedBlock <|> list <|> paragraph paragraph = wrap "syntax.docParagraph" $ join <$> spaced leaf reserved word = List.isPrefixOf "}}" word || all (== '#') word From 543daa36c74ab0bf1d4643b37b4baef30de88994 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 3 Jul 2024 12:59:09 -0600 Subject: [PATCH 14/50] Move the `Annotated` class to the `Ann` module This is in preparation for using `Ann` in the `Lexer` module, as that module actually does some parsing. --- unison-syntax/src/Unison/Parser/Ann.hs | 21 +++++++++++++++++++++ unison-syntax/src/Unison/Syntax/Lexer.hs | 5 +++++ unison-syntax/src/Unison/Syntax/Parser.hs | 16 ++-------------- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/unison-syntax/src/Unison/Parser/Ann.hs b/unison-syntax/src/Unison/Parser/Ann.hs index feec96279c..961bbcb30c 100644 --- a/unison-syntax/src/Unison/Parser/Ann.hs +++ b/unison-syntax/src/Unison/Parser/Ann.hs @@ -4,7 +4,10 @@ module Unison.Parser.Ann where +import Data.List.NonEmpty (NonEmpty) +import Data.Void (absurd) import Unison.Lexer.Pos qualified as L +import Unison.Prelude data Ann = -- Used for things like Builtins which don't have a source position. @@ -79,3 +82,21 @@ encompasses (GeneratedFrom ann) other = encompasses ann other encompasses ann (GeneratedFrom other) = encompasses ann other encompasses (Ann start1 end1) (Ann start2 end2) = Just $ start1 <= start2 && end1 >= end2 + +class Annotated a where + ann :: a -> Ann + +instance Annotated Ann where + ann = id + +instance (Annotated a) => Annotated [a] where + ann = foldMap ann + +instance (Annotated a) => Annotated (NonEmpty a) where + ann = foldMap ann + +instance (Annotated a) => Annotated (Maybe a) where + ann = foldMap ann + +instance Annotated Void where + ann = absurd diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 77c91b8e84..fa169e2d06 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1,4 +1,5 @@ {-# LANGUAGE TemplateHaskell #-} +{-# OPTIONS_GHC -Wno-orphans #-} module Unison.Syntax.Lexer ( Token (..), @@ -51,6 +52,7 @@ import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment +import Unison.Parser.Ann (Ann (Ann), Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as SH @@ -64,6 +66,9 @@ import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) import Unison.Util.Bytes qualified as Bytes import Unison.Util.Monoid (intercalateMap) +instance Annotated (Token a) where + ann (Token _ s e) = Ann s e + type BlockName = String type Layout = [(BlockName, Column)] diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index affab5bf2c..4945f4347e 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -1,4 +1,5 @@ {-# LANGUAGE UndecidableInstances #-} +{-# OPTIONS_GHC -Wno-orphans #-} module Unison.Syntax.Parser ( Annotated (..), @@ -77,7 +78,7 @@ import Unison.Hashable qualified as Hashable import Unison.Name as Name import Unison.Names (Names) import Unison.Names.ResolutionResult qualified as Names -import Unison.Parser.Ann (Ann (..)) +import Unison.Parser.Ann (Ann (..), Annotated (..)) import Unison.Pattern (Pattern) import Unison.Pattern qualified as Pattern import Unison.Prelude @@ -177,25 +178,12 @@ newtype Input = Input {inputStream :: [L.Token L.Lexeme]} deriving stock (Eq, Ord, Show) deriving newtype (P.Stream, P.VisualStream) -class Annotated a where - ann :: a -> Ann - -instance Annotated Ann where - ann = id - -instance Annotated (L.Token a) where - ann (L.Token _ s e) = Ann s e - instance (Annotated a) => Annotated (ABT.Term f v a) where ann = ann . ABT.annotation instance (Annotated a) => Annotated (Pattern a) where ann = ann . Pattern.loc -instance (Annotated a) => Annotated [a] where - ann [] = mempty - ann (h : t) = foldl' (\acc a -> acc <> ann a) (ann h) t - instance (Annotated a, Annotated b) => Annotated (MatchCase a b) where ann (MatchCase p _ b) = ann p <> ann b From 5f87b4152739662e20120810ee0525228f9a3363 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 1 Jul 2024 21:11:53 -0600 Subject: [PATCH 15/50] Un-hiding the `Doc` parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `doc2` was a parser in lexer’s clothing. It would parse recursively, but then return the result as a flat list of tokens. This separates the parsing from the “unparsing” (which returns the tokens), so now we have a parser to a recursive `Doc` structure. This currently immediately applies the unparser, and should result in an identical stream of tokens as the previous version. Eventually, we should be able to avoid unparsing the `Doc` structure. --- unison-syntax/package.yaml | 1 + unison-syntax/src/Unison/Syntax/Lexer.hs | 451 ++++++++++++++++------- unison-syntax/unison-syntax.cabal | 2 + 3 files changed, 330 insertions(+), 124 deletions(-) diff --git a/unison-syntax/package.yaml b/unison-syntax/package.yaml index 8e1a478baf..ccb1a057d7 100644 --- a/unison-syntax/package.yaml +++ b/unison-syntax/package.yaml @@ -10,6 +10,7 @@ dependencies: - containers - cryptonite - extra + - free - lens - megaparsec - mtl diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index fa169e2d06..fd27118050 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -8,6 +8,16 @@ module Unison.Syntax.Lexer Err (..), Pos (..), Lexeme (..), + DocTree, + DocUntitledSection (..), + DocTop (..), + DocColumn (..), + DocLeaf (..), + DocEmbedLink (..), + DocSourceElement (..), + DocEmbedSignatureLink (..), + DocJoin (..), + DocEmbedAnnotation (..), lexer, escapeChars, debugFileLex, @@ -28,16 +38,19 @@ module Unison.Syntax.Lexer ) where +import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.State qualified as S import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) import Data.Foldable qualified as Foldable import Data.List qualified as List import Data.List.Extra qualified as List +import Data.List.NonEmpty (NonEmpty ((:|))) import Data.List.NonEmpty qualified as Nel import Data.List.NonEmpty qualified as NonEmpty import Data.Map.Strict qualified as Map import Data.Set qualified as Set import Data.Text qualified as Text +import Data.Void (vacuous) import GHC.Exts (sortWith) import Text.Megaparsec qualified as P import Text.Megaparsec.Char (char) @@ -52,7 +65,7 @@ import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment -import Unison.Parser.Ann (Ann (Ann), Annotated (..)) +import Unison.Parser.Ann (Ann (Ann, GeneratedFrom), Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as SH @@ -66,6 +79,9 @@ import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) import Unison.Util.Bytes qualified as Bytes import Unison.Util.Monoid (intercalateMap) +instance (Annotated a) => Annotated (Cofree f a) where + ann (a :< _) = ann a + instance Annotated (Token a) where ann (Token _ s e) = Ann s e @@ -418,11 +434,9 @@ doc2 = do void $ lit "{{" openEnd <- posP CP.space - -- Construct the token for opening the doc block. - let openTok = Token (Open "syntax.docUntitledSection") openStart openEnd env0 <- S.get -- Disable layout while parsing the doc block and reset the section number - (bodyToks0, closeTok) <- local + (docToks, closeTok) <- local ( \env -> env { inLayout = False, @@ -430,33 +444,30 @@ doc2 = do } ) do - bodyToks <- docBody + bodyToks <- docBody (lit "}}") closeStart <- posP lit "}}" closeEnd <- posP - pure (bodyToks, Token Close closeStart closeEnd) - let docToks = beforeStartToks <> [openTok] <> bodyToks0 <> [closeTok] + pure (docToLexemes (openStart, closeEnd) bodyToks, Token Close closeStart closeEnd) -- Parse any layout tokens after the doc block, e.g. virtual semicolon endToks <- token' ignore (pure ()) -- Hack to allow anonymous doc blocks before type decls -- {{ Some docs }} Foo.doc = {{ Some docs }} -- ability Foo where => ability Foo where tn <- subsequentTypeName - pure $ case (tn) of - -- If we're followed by a type, we rewrite the doc block to be a named doc block. - (Just (WordyId tname)) - | isTopLevel -> - beforeStartToks - <> [WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment)) <$ openTok, Open "=" <$ openTok] - <> [openTok] - <> bodyToks0 - <> [closeTok] - -- We need an extra 'Close' here because we added an extra Open above. - <> [closeTok] - <> endToks - where - isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docToks <> endToks + pure $ + beforeStartToks <> case (tn) of + -- If we're followed by a type, we rewrite the doc block to be a named doc block. + Just (WordyId tname) + | isTopLevel -> + Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd + : Token (Open "=") openStart openEnd + : docToks + -- We need an extra 'Close' here because we added an extra Open above. + <> (closeTok : endToks) + where + isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 + _ -> docToks <> endToks where -- DUPLICATED wordyKw kw = separated wordySep (lit kw) @@ -481,17 +492,221 @@ doc2 = do where ok s = length [() | '\n' <- s] < 2 +-- | Like `P.some`, but returns an actual `NonEmpty`. +some' :: P a -> P (NonEmpty a) +some' p = liftA2 (:|) p $ many p + +-- | Like `P.someTill`, but returns an actual `NonEmpty`. +someTill' :: P a -> P end -> P (NonEmpty a) +someTill' p end = liftA2 (:|) p $ P.manyTill p end + +-- | Like `P.sepBy1`, but returns an actual `NonEmpty`. +sepBy1' :: P a -> P sep -> P (NonEmpty a) +sepBy1' p sep = liftA2 (:|) p . many $ sep *> p + +newtype DocUntitledSection a = DocUntitledSection [a] + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +-- | Haskell parallel to @unison/base.Doc@. +-- +-- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The +-- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls, +-- have it rendered to a scratch file, and then we can’t parse it. Changing the types here to match Unison wouldn’t +-- fix the issue. We have to modify the types and parser in concert (in both Haskell and Unison) to bring them in +-- line. +-- +-- __NB__: Uses of @[`Token` `Lexeme`]@ here indicate a nested transition to the Unison lexer. +data DocTop a + = -- | The first argument is always a Paragraph + DocSection a [a] + | DocEval [Token Lexeme] + | DocExampleBlock [Token Lexeme] + | DocCodeBlock (Token String) (Token String) + | DocBulletedList (NonEmpty (DocColumn a)) + | DocNumberedList (NonEmpty (Token Word64, DocColumn a)) + | DocParagraph (NonEmpty (DocLeaf a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data DocColumn a + = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List + DocColumn a (Maybe a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data DocLeaf a + = DocLink DocEmbedLink + | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of + -- Transcludes & Words) + DocNamedLink a (DocLeaf Void) + | DocExample [Token Lexeme] + | DocTransclude [Token Lexeme] + | -- | Always a Paragraph + DocBold a + | -- | Always a Paragraph + DocItalic a + | -- | Always a Paragraph + DocStrikethrough a + | -- | Always a Word + DocVerbatim (DocLeaf Void) + | -- | Always a Word + DocCode (DocLeaf Void) + | DocSource (NonEmpty DocSourceElement) + | DocFoldedSource (NonEmpty DocSourceElement) + | DocEvalInline [Token Lexeme] + | DocSignature (NonEmpty DocEmbedSignatureLink) + | DocSignatureInline DocEmbedSignatureLink + | DocWord (Token String) + | DocGroup (DocJoin a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data DocEmbedLink + = DocEmbedTypeLink (Token (HQ'.HashQualified Name)) + | DocEmbedTermLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +data DocSourceElement = DocSourceElement DocEmbedLink [DocEmbedAnnotation] + deriving (Eq, Ord, Show) + +newtype DocEmbedSignatureLink = DocEmbedSignatureLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +newtype DocJoin a = DocJoin (NonEmpty (DocLeaf a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +newtype DocEmbedAnnotation + = -- | Always a DocTransclude + DocEmbedAnnotation (Either (Token (HQ'.HashQualified Name)) (DocLeaf Void)) + deriving (Eq, Ord, Show) + +type DocTree = Cofree DocTop Ann + +instance (Annotated a) => Annotated (DocTop a) where + ann = \case + DocSection title body -> ann title <> ann body + DocEval code -> ann code + DocExampleBlock code -> ann code + DocCodeBlock label body -> ann label <> ann body + DocBulletedList items -> ann items + DocNumberedList items -> ann $ snd <$> items + DocParagraph leaves -> ann leaves + +instance (Annotated a) => Annotated (DocColumn a) where + ann (DocColumn para list) = ann para <> ann list + +instance (Annotated a) => Annotated (DocLeaf a) where + ann = \case + DocLink link -> ann link + DocNamedLink label target -> ann label <> ann target + DocExample code -> ann code + DocTransclude code -> ann code + DocBold para -> ann para + DocItalic para -> ann para + DocStrikethrough para -> ann para + DocVerbatim word -> ann word + DocCode word -> ann word + DocSource elems -> ann elems + DocFoldedSource elems -> ann elems + DocEvalInline code -> ann code + DocSignature links -> ann links + DocSignatureInline link -> ann link + DocWord text -> ann text + DocGroup (DocJoin leaves) -> ann leaves + +instance Annotated DocEmbedLink where + ann = \case + DocEmbedTypeLink name -> ann name + DocEmbedTermLink name -> ann name + +instance Annotated DocSourceElement where + ann (DocSourceElement link target) = ann link <> ann target + +instance Annotated DocEmbedSignatureLink where + ann (DocEmbedSignatureLink name) = ann name + +instance Annotated DocEmbedAnnotation where + ann (DocEmbedAnnotation a) = either ann ann a + +-- | This is a short-term hack to turn our parse tree back into the sequence of lexemes the current parser expects. +-- +-- The medium-term solution is to preserve @[`DocTree`]@ as its own lexeme type, and hand it to the parser without +-- flattening it back to tokens. Longer-term, maybe we add a real lexer for @Doc@, and then whatever is left of this +-- parser moves into the actual parser. +docToLexemes :: (Pos, Pos) -> DocUntitledSection DocTree -> [Token Lexeme] +docToLexemes (startDoc, endDoc) (DocUntitledSection tops) = + Token (Open "syntax.docUntitledSection") startDoc startDoc + : concatMap cata tops <> pure (Token Close endDoc endDoc) + where + wrap :: Ann -> String -> [Token Lexeme] -> [Token Lexeme] + wrap ann suffix lexemes = go (extractStart ann) lexemes + where + extractStart = \case + Ann start _ -> start + GeneratedFrom a -> extractStart a + a -> error $ "expected a good Pos! Got: " <> show a + o = "syntax.doc" <> suffix + go start [] = [Token (Open o) start start, Token Close start start] + go start ts@(Token _ x _ : _) = + Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) + where + final = last ts + cata :: DocTree -> [Token Lexeme] + cata (a :< top) = docTop a $ cata <$> top + docTop start = \case + DocSection title body -> wrap start "Section" $ title <> join body + DocEval code -> wrap start "Eval" code + DocExampleBlock code -> wrap start "ExampleBlock" code + DocCodeBlock label text -> wrap start "CodeBlock" [Textual <$> label, Textual <$> text] + DocBulletedList items -> wrap start "BulletedList" . concat $ (\col -> docColumn (ann col) col) <$> items + DocNumberedList items -> + wrap start "NumberedList" . concat $ + uncurry (:) . bimap (Numeric . show <$>) (\col -> docColumn (ann col) col) + <$> items + DocParagraph body -> wrap start "Paragraph" . concat $ (\l -> docLeaf (ann l) l) <$> body + docColumn start (DocColumn para mlist) = wrap start "Column" $ foldr (flip (<>)) para mlist + docLeaf start = \case + DocLink link -> wrap start "Link" $ docEmbedLink (ann link) link + DocNamedLink name target -> wrap start "NamedLink" $ name <> docLeaf (ann target) (vacuous target) + DocExample code -> wrap start "Example" code + DocTransclude code -> wrap start "Transclude" code + DocBold para -> wrap start "Bold" para + DocItalic para -> wrap start "Italic" para + DocStrikethrough para -> wrap start "Strikethrough" para + DocVerbatim word -> wrap start "Verbatim" . docLeaf (ann word) $ vacuous word + DocCode word -> wrap start "Code" . docLeaf (ann word) $ vacuous word + DocSource elems -> wrap start "Source" . concat $ (\e -> docSourceElement (ann e) e) <$> elems + DocFoldedSource elems -> wrap start "FoldedSource" . concat $ (\e -> docSourceElement (ann e) e) <$> elems + DocEvalInline code -> wrap start "EvalInline" code + DocSignature links -> wrap start "Signature" . concat $ (\l -> docEmbedSignatureLink (ann l) l) <$> links + DocSignatureInline link -> wrap start "SignatureInline" $ docEmbedSignatureLink (ann link) link + DocWord text -> wrap start "Word" . pure $ Textual <$> text + DocGroup (DocJoin leaves) -> + wrap start "Group" . wrap start "Join" . concat $ (\l -> docLeaf (ann l) l) <$> leaves + docEmbedLink start = \case + DocEmbedTypeLink ident -> wrap start "EmbedTypeLink" . pure $ identifierLexeme <$> ident + DocEmbedTermLink ident -> wrap start "EmbedTermLink" . pure $ identifierLexeme <$> ident + docSourceElement start (DocSourceElement link anns) = + wrap start "SourceElement" $ + docEmbedLink (ann link) link + <> maybe + [] + ((Token (Reserved "@") (Pos 0 0) (Pos 0 0) :) . concatMap (\a -> docEmbedAnnotation (ann a) a)) + (NonEmpty.nonEmpty anns) + docEmbedSignatureLink start (DocEmbedSignatureLink ident) = + wrap start "EmbedSignatureLink" . pure $ identifierLexeme <$> ident + docEmbedAnnotation start (DocEmbedAnnotation a) = + wrap start "EmbedAnnotation" $ either (pure . fmap identifierLexeme) (\l -> docLeaf (ann l) $ vacuous l) a + -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -docBody :: P [Token Lexeme] -docBody = join <$> P.many (sectionElem <* CP.space) +docBody :: P end -> P (DocUntitledSection DocTree) +docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where wordyKw kw = separated wordySep (lit kw) sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap "syntax.docParagraph" $ join <$> spaced leaf + paragraph = wrap' . DocParagraph <$> spaced leaf reserved word = List.isPrefixOf "}}" word || all (== '#') word - wordy closing = wrap "syntax.docWord" . tok . fmap Textual . P.try $ do + wordy :: P end -> P (DocLeaf void) + wordy closing = fmap DocWord . tokenP . P.try $ do let end = P.lookAhead $ void docClose @@ -520,65 +735,61 @@ docBody = join <$> P.many (sectionElem <* CP.space) where comma = lit "," <* CP.space src = - src' "syntax.docSource" "@source" - <|> src' "syntax.docFoldedSource" "@foldedSource" + src' DocSource "@source" + <|> src' DocFoldedSource "@foldedSource" srcElem = - wrap "syntax.docSourceElement" $ - (typeLink <|> termLink) - <+> ( fmap (fromMaybe []) . P.optional $ - (tok (Reserved <$> lit "@") <+> (CP.space *> annotations)) - ) + DocSourceElement + <$> (typeLink <|> termLink) + <*> ( fmap (fromMaybe []) . P.optional $ + (lit "@") *> (CP.space *> annotations) + ) where - annotation = tok identifierLexemeP <|> expr <* CP.space + annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space annotations = - join <$> P.some (wrap "syntax.docEmbedAnnotation" annotation) - src' name atName = wrap name $ do + P.some (DocEmbedAnnotation <$> annotation) + src' name atName = fmap name $ do _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- P.sepBy1 srcElem comma + s <- sepBy1' srcElem comma _ <- lit "}" - pure (join s) - signature = wrap "syntax.docSignature" $ do + pure s + signature = fmap DocSignature $ do _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- join <$> P.sepBy1 signatureLink comma + s <- sepBy1' signatureLink comma _ <- lit "}" pure s - signatureInline = wrap "syntax.docSignatureInline" $ do + signatureInline = fmap DocSignatureInline $ do _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space s <- signatureLink _ <- lit "}" pure s - evalInline = wrap "syntax.docEvalInline" $ do + evalInline = fmap DocEvalInline $ do _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space let inlineEvalClose = [] <$ lit "}" s <- lexemes' inlineEvalClose pure s - typeLink = wrap "syntax.docEmbedTypeLink" do + typeLink = fmap DocEmbedTypeLink $ do _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tok identifierLexemeP <* CP.space + tokenP identifierP <* CP.space termLink = - wrap "syntax.docEmbedTermLink" $ - tok identifierLexemeP <* CP.space + fmap DocEmbedTermLink $ + tokenP identifierP <* CP.space signatureLink = - wrap "syntax.docEmbedSignatureLink" $ - tok identifierLexemeP <* CP.space + fmap DocEmbedSignatureLink $ + tokenP identifierP <* CP.space groupy closing p = do - Token p start stop <- tokenP p + Token p _ _ <- tokenP p after <- P.optional . P.try $ leafy closing pure $ case after of Nothing -> p Just after -> - [ Token (Open "syntax.docGroup") start stop', - Token (Open "syntax.docJoin") start stop' - ] - <> p - <> after - <> (take 2 $ repeat (Token Close stop' stop')) - where - stop' = maybe stop end (lastMay after) + DocGroup + . DocJoin + $ p + :| pure after verbatim = P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do @@ -595,17 +806,17 @@ docBody = join <$> P.many (sectionElem <* CP.space) let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed -- If it's a multi-line verbatim block we trim any whitespace representing -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - wrap "syntax.docVerbatim" $ - wrap "syntax.docWord" $ - pure [Token (Textual txt) start stop] + pure . DocVerbatim $ + DocWord $ + Token txt start stop else - wrap "syntax.docCode" $ - wrap "syntax.docWord" $ - pure [Token (Textual originalText) start stop] + pure . DocCode $ + DocWord $ + Token originalText start stop exampleInline = P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - wrap "syntax.docExample" $ do + fmap DocExample $ do n <- P.try $ do _ <- lit "`" length <$> P.takeWhile1P (Just "backticks") (== '`') @@ -613,19 +824,19 @@ docBody = join <$> P.many (sectionElem <* CP.space) ex <- CP.space *> lexemes' end pure ex - docClose = [] <$ lit "}}" + docClose = [] <$ docClose' docOpen = [] <$ lit "{{" link = P.label "link (examples: {type List}, {Nat.+})" $ - wrap "syntax.docLink" $ + fmap DocLink $ P.try $ lit "{" *> (typeLink <|> termLink) <* lit "}" expr = - P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + fmap DocTransclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ openAs "{{" "syntax.docTransclude" - <+> do + *> do env0 <- S.get -- we re-allow layout within a transclusion, then restore it to its -- previous state after @@ -637,7 +848,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) S.modify (\env -> env {inLayout = inLayout env0}) pure ts - <+> close ["syntax.docTransclude"] (lit "}}") + <* close ["syntax.docTransclude"] (lit "}}") nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace @@ -654,7 +865,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) P.label "block eval (syntax: a fenced code block)" $ evalUnison <|> exampleBlock <|> other where - evalUnison = wrap "syntax.docEval" $ do + evalUnison = fmap (wrap' . DocEval) $ do -- commit after seeing that ``` is on its own line fence <- P.try $ do fence <- lit "```" <+> P.takeWhileP Nothing (== '`') @@ -665,7 +876,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) (\env -> env {inLayout = True, opening = Just "docEval"}) (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) - exampleBlock = wrap "syntax.docExampleBlock" $ do + exampleBlock = fmap (wrap' . DocExampleBlock) $ do void $ lit "@typecheck" <* CP.space fence <- lit "```" <+> P.takeWhileP Nothing (== '`') local @@ -682,20 +893,20 @@ docBody = join <$> P.many (sectionElem <* CP.space) skip _ s = s in List.intercalate "\n" $ skip column <$> lines s - other = wrap "syntax.docCodeBlock" $ do + other = fmap (uncurry $ wrapSimple2 DocCodeBlock) $ do column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel let tabWidth = toInteger . P.unPos $ P.defaultTabWidth fence <- lit "```" <+> P.takeWhileP Nothing (== '`') name <- P.takeWhileP Nothing nonNewlineSpace - *> tok (Textual <$> P.takeWhile1P Nothing (not . isSpace)) + *> tokenP (P.takeWhile1P Nothing (not . isSpace)) <* P.takeWhileP Nothing nonNewlineSpace _ <- void CP.eol verbatim <- - tok $ - Textual . uncolumn column tabWidth . trimAroundDelimiters + tokenP $ + uncolumn column tabWidth . trimAroundDelimiters <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name <> verbatim) + pure (name, verbatim) boldOrItalicOrStrikethrough closing = do let start = @@ -705,30 +916,29 @@ docBody = join <$> P.many (sectionElem <* CP.space) (P.satisfy (== '~')) name s = if take 1 s == "~" - then "syntax.docStrikethrough" - else if take 1 s == "*" then "syntax.docBold" else "syntax.docItalic" + then DocStrikethrough + else if take 1 s == "*" then DocBold else DocItalic end <- P.try $ do end <- start P.lookAhead (P.satisfy (not . isSpace)) pure end - wrap (name end) . wrap "syntax.docParagraph" $ - join - <$> P.someTill - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) + name end . wrap' . DocParagraph + <$> someTill' + (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) + (lit end) externalLink = P.label "hyperlink (example: [link name](https://destination.com))" $ - wrap "syntax.docNamedLink" $ do + fmap (uncurry DocNamedLink) $ do _ <- lit "[" p <- leafies (void $ char ']') _ <- lit "]" _ <- lit "(" target <- - wrap "syntax.docGroup" . wrap "syntax.docJoin" $ - link <|> fmap join (P.some (expr <|> wordy (char ')'))) + fmap (DocGroup . DocJoin) $ + fmap pure link <|> some' (expr <|> wordy (char ')')) _ <- lit ")" - pure (p <> target) + pure (p, target) -- newline = P.optional (lit "\r") *> lit "\n" @@ -742,15 +952,15 @@ docBody = join <$> P.many (sectionElem <* CP.space) where ok s = length [() | '\n' <- s] < 2 - spaced p = P.some (p <* P.optional sp) - leafies close = wrap "syntax.docParagraph" $ join <$> spaced (leafy close) + spaced p = some' (p <* P.optional sp) + leafies close = wrap' . DocParagraph <$> spaced (leafy close) list = bulletedList <|> numberedList - bulletedList = wrap "syntax.docBulletedList" $ join <$> P.sepBy1 bullet listSep - numberedList = wrap "syntax.docNumberedList" $ join <$> P.sepBy1 numberedItem listSep + bulletedList = wrap' . DocBulletedList <$> sepBy1' bullet listSep + numberedList = wrap' . DocNumberedList <$> sepBy1' numberedItem listSep - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (bulletedStart <|> numberedStart) + listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) bulletedStart = P.try $ do r <- listItemStart' $ [] <$ P.satisfy bulletChar @@ -759,6 +969,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) where bulletChar ch = ch == '*' || ch == '-' || ch == '+' + listItemStart' :: P a -> P (Int, a) listItemStart' gutter = P.try $ do nonNewlineSpaces col <- column <$> posP @@ -767,14 +978,11 @@ docBody = join <$> P.many (sectionElem <* CP.space) (col,) <$> gutter numberedStart = - listItemStart' $ P.try (tok . fmap num $ LP.decimal <* lit ".") - where - num :: Word -> Lexeme - num n = Numeric (show n) + listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - listItemParagraph = wrap "syntax.docParagraph" $ do + listItemParagraph = fmap (wrap' . DocParagraph) $ do col <- column <$> posP - join <$> P.some (leaf <* sep col) + some' (leaf <* sep col) where -- Trickiness here to support hard line breaks inside of -- a bulleted list, so for instance this parses as expected: @@ -792,29 +1000,29 @@ docBody = join <$> P.many (sectionElem <* CP.space) *> do col2 <- column <$> posP guard $ col2 >= col - (P.notFollowedBy $ numberedStart <|> bulletedStart) + (P.notFollowedBy $ void numberedStart <|> void bulletedStart) pure () numberedItem = P.label msg $ do (col, s) <- numberedStart - pure s - <+> ( wrap "syntax.docColumn" $ do + (s,) + <$> ( fmap (uncurry DocColumn) $ do p <- nonNewlineSpaces *> listItemParagraph subList <- local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) + pure (p, subList) ) where msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" - bullet = wrap "syntax.docColumn" . P.label "bullet (examples: * item1, - item2)" $ do + bullet = fmap (uncurry DocColumn) . P.label "bullet (examples: * item1, - item2)" $ do (col, _) <- bulletedStart p <- nonNewlineSpaces *> listItemParagraph subList <- local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) + pure (p, subList) newline = P.label "newline" $ lit "\n" <|> lit "\r\n" @@ -828,8 +1036,8 @@ docBody = join <$> P.many (sectionElem <* CP.space) -- A paragraph under this subsection. -- # A section title (not a subsection) - section :: P [Token Lexeme] - section = wrap "syntax.docSection" $ do + section :: P DocTree + section = fmap (wrap' . uncurry DocSection) $ do ns <- S.gets parentSections hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp title <- paragraph <* CP.space @@ -837,19 +1045,13 @@ docBody = join <$> P.many (sectionElem <* CP.space) body <- local (\env -> env {parentSections = (m : (tail ns))}) $ P.many (sectionElem <* CP.space) - pure $ title <> join body + pure $ (title, body) - wrap :: String -> P [Token Lexeme] -> P [Token Lexeme] - wrap o p = do - start <- posP - lexemes <- p - pure $ go start lexemes - where - go start [] = [Token (Open o) start start, Token Close start start] - go start ts@(Token _ x _ : _) = - Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) - where - final = last ts + wrap' :: DocTop DocTree -> DocTree + wrap' doc = ann doc :< doc + + wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> DocTop DocTree) -> a -> b -> DocTree + wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] lexemes' eof = @@ -1289,12 +1491,13 @@ identifierP = do -- .foo.++.doc -- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") identifierLexemeP :: P Lexeme -identifierLexemeP = do - name <- identifierP - pure - if Name.isSymboly (HQ'.toName name) - then SymbolyId name - else WordyId name +identifierLexemeP = identifierLexeme <$> identifierP + +identifierLexeme :: HQ'.HashQualified Name -> Lexeme +identifierLexeme name = + if Name.isSymboly (HQ'.toName name) + then SymbolyId name + else WordyId name wordyIdSegP :: P NameSegment wordyIdSegP = diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 888982134f..4b097e6021 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -69,6 +69,7 @@ library , containers , cryptonite , extra + , free , lens , megaparsec , mtl @@ -127,6 +128,7 @@ test-suite syntax-tests , cryptonite , easytest , extra + , free , lens , megaparsec , mtl From 227ff27cea21af3a636bb94d0ccadc80fcd37d9f Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 5 Jul 2024 14:15:05 -0600 Subject: [PATCH 16/50] =?UTF-8?q?Don=E2=80=99t=20=E2=80=9Cun-parse?= =?UTF-8?q?=E2=80=9D=20`Doc`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes the layer that makes the `Doc` parser look like a lexer and replaces it with a function that converts the Doc structure directly Unison Terms. --- .../src/Unison/Syntax/TermParser.hs | 301 +++++++++--------- unison-syntax/src/Unison/Syntax/Lexer.hs | 103 ++---- unison-syntax/src/Unison/Syntax/Parser.hs | 9 +- 3 files changed, 178 insertions(+), 235 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 635a974d89..1f1dda24c1 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -12,6 +12,7 @@ module Unison.Syntax.TermParser ) where +import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.Reader (asks, local) import Data.Char qualified as Char import Data.Foldable (foldrM) @@ -24,6 +25,7 @@ import Data.Sequence qualified as Sequence import Data.Set qualified as Set import Data.Text qualified as Text import Data.Tuple.Extra qualified as TupleE +import Data.Void (vacuous) import Text.Megaparsec qualified as P import U.Core.ABT qualified as ABT import Unison.ABT qualified as ABT @@ -38,7 +40,7 @@ import Unison.NameSegment qualified as NameSegment import Unison.Names (Names) import Unison.Names qualified as Names import Unison.NamesWithHistory qualified as Names -import Unison.Parser.Ann (Ann) +import Unison.Parser.Ann (Ann (Ann)) import Unison.Parser.Ann qualified as Ann import Unison.Pattern (Pattern) import Unison.Pattern qualified as Pattern @@ -113,8 +115,10 @@ rewriteBlock = do pure (DD.rewriteType (ann kw <> ann rhs) (L.payload <$> vs) lhs rhs) typeLink' :: (Monad m, Var v) => P v m (L.Token Reference) -typeLink' = do - id <- hqPrefixId +typeLink' = findUniqueType =<< hqPrefixId + +findUniqueType :: (Monad m, Var v) => L.Token (HQ.HashQualified Name) -> P v m (L.Token Reference) +findUniqueType id = do ns <- asks names case Names.lookupHQType Names.IncludeSuffixes (L.payload id) ns of s @@ -434,7 +438,7 @@ resolveHashQualified tok = do names <- asks names case L.payload tok of HQ.NameOnly n -> pure $ Term.var (ann tok) (Name.toVar n) - _ -> case Names.lookupHQTerm Names.IncludeSuffixes (L.payload tok) names of + hqn -> case Names.lookupHQTerm Names.IncludeSuffixes hqn names of s | Set.null s -> failCommitted $ UnknownTerm tok s | Set.size s > 1 -> failCommitted $ UnknownTerm tok s @@ -461,160 +465,155 @@ termLeaf = doc2Block <&> \(spanAnn, trm) -> trm {ABT.annotation = ABT.annotation trm <> spanAnn} ] --- Syntax for documentation v2 blocks, which are surrounded by {{ }}. +-- | Gives a parser an explicit stream to parse, so that it consumes nothing from the original stream when it runs. +-- +-- This is used inside the `Doc` -> `Term` conversion, where we have chunks of Unison code embedded that need to be +-- parsed. It’s a consequence of parsing Doc in the midst of the Unison lexer. +subParse :: (Ord v, Monad m) => P v m a -> [L.Token L.Lexeme] -> P v m a +subParse p toks = do + orig <- P.getInput + P.setInput $ Input toks + result <- p <* P.eof + P.setInput orig + pure result + +-- | Syntax for documentation v2 blocks, which are surrounded by @{{@ @}}@. -- The lexer does most of the heavy lifting so there's not a lot for -- the parser to do. For instance, in -- --- {{ --- Hi there! --- --- goodbye. --- }} +-- > {{ +-- > Hi there! +-- > +-- > goodbye. +-- > }} -- -- the lexer will produce: -- --- [Open "syntax.docUntitledSection", --- Open "syntax.docParagraph", --- Open "syntax.docWord", Textual "Hi", Close, --- Open "syntax.docWord", Textual "there!", Close, --- Close --- Open "syntax.docParagraph", --- Open "syntax.docWord", Textual "goodbye", Close, --- Close --- Close] +-- > [ Doc +-- > ( DocUntitledSection +-- > (DocParagraph (DocWord "Hi" :| [DocWord "there!"])) +-- > (DocParagraph (DocWord "goodbye" :| [])) +-- > ) +-- > ] -- -- The parser will parse this into the Unison expression: -- --- syntax.docUntitledSection [ --- syntax.docParagraph [syntax.docWord "Hi", syntax.docWord "there!"], --- syntax.docParagraph [syntax.docWord "goodbye"] --- ] +-- > syntax.docUntitledSection [ +-- > syntax.docParagraph [syntax.docWord "Hi", syntax.docWord "there!"], +-- > syntax.docParagraph [syntax.docWord "goodbye"] +-- > ] -- --- Where `syntax.doc{Paragraph, UntitledSection,...}` are all ordinary term +-- Where @syntax.doc{Paragraph, UntitledSection,...}@ are all ordinary term -- variables that will be looked up in the environment like anything else. This -- means that the documentation syntax can have its meaning changed by --- overriding what functions the names `syntax.doc*` correspond to. +-- overriding what functions the names @syntax.doc*@ correspond to. doc2Block :: forall m v. (Monad m, Var v) => P v m (Ann {- Annotation for the whole spanning block -}, Term v Ann) doc2Block = do - P.lookAhead (openBlockWith "syntax.docUntitledSection") *> elem + L.Token docContents startDoc endDoc <- doc + let docAnn = Ann startDoc endDoc + (docAnn,) . docUntitledSection (gann docAnn) <$> traverse (cata $ docTop <=< sequenceA) docContents where - -- For terms which aren't blocks the spanning annotation is the same as the - -- term annotation. - selfAnnotated :: Term v Ann -> (Ann, Term v Ann) - selfAnnotated t = (ann t, t) - elem :: P v m (Ann {- Annotation for the whole spanning block -}, Term v Ann) - elem = - (selfAnnotated <$> text) <|> do - startTok <- openBlock - let -- here, `t` will be something like `Open "syntax.docWord"` - -- so `f` will be a term var with the name "syntax.docWord". - f = f' startTok - f' t = Term.var (ann t) (Var.nameds (L.payload t)) - - -- follows are some common syntactic forms used for parsing child elements - - -- regular is parsed into `f child1 child2 child3` for however many children - regular = do - cs <- P.many (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f cs - pure (ann startTok <> ann endTok, trm) - - -- variadic is parsed into: `f [child1, child2, ...]` - variadic = variadic' f - variadic' f = do - cs <- P.many (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f [Term.list (ann cs) cs] - pure (ann startTok <> ann endTok, trm) - - -- sectionLike is parsed into: `f tm [child1, child2, ...]` - sectionLike = do - arg1 <- (snd <$> elem) - cs <- P.many (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f [arg1, Term.list (ann cs) cs] - pure (ann startTok <> ann endTok, trm) - - evalLike wrap = do - tm <- term - endTok <- closeBlock - let trm = Term.apps' f [wrap tm] - pure (ann startTok <> ann endTok, trm) - - -- converts `tm` to `'tm` - -- - -- Embedded examples like ``1 + 1`` are represented as terms, - -- but are wrapped in delays so they are left unevaluated for the - -- code which renders documents. (We want the doc display to get - -- the unevaluated expression `1 + 1` and not `2`) - addDelay tm = Term.delay (ann tm) tm - case L.payload startTok of - "syntax.docJoin" -> variadic - "syntax.docUntitledSection" -> variadic - "syntax.docColumn" -> variadic - "syntax.docParagraph" -> variadic - "syntax.docSignature" -> variadic - "syntax.docSource" -> variadic - "syntax.docFoldedSource" -> variadic - "syntax.docBulletedList" -> variadic - "syntax.docSourceAnnotations" -> variadic - "syntax.docSourceElement" -> do - link <- (snd <$> elem) - anns <- P.optional $ reserved "@" *> (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f [link, fromMaybe (Term.list (ann link) mempty) anns] - pure (ann startTok <> ann endTok, trm) - "syntax.docNumberedList" -> do - nitems@((n, _) : _) <- P.some nitem - endTok <- closeBlock - let items = snd <$> nitems - let trm = Term.apps' f [n, Term.list (ann items) items] - pure (ann startTok <> ann endTok, trm) - where - nitem = do - n <- number - t <- openBlockWith "syntax.docColumn" - let f = f' ("syntax.docColumn" <$ t) - (_spanAnn, child) <- variadic' f - pure (n, child) - "syntax.docSection" -> sectionLike - -- @source{ type Blah, foo, type Bar } - "syntax.docEmbedTermLink" -> do - tm <- addDelay <$> (hashQualifiedPrefixTerm <|> hashQualifiedInfixTerm) - endTok <- closeBlock - let trm = Term.apps' f [tm] - pure (ann startTok <> ann endTok, trm) - "syntax.docEmbedSignatureLink" -> do - tm <- addDelay <$> (hashQualifiedPrefixTerm <|> hashQualifiedInfixTerm) - endTok <- closeBlock - let trm = Term.apps' f [tm] - pure (ann startTok <> ann endTok, trm) - "syntax.docEmbedTypeLink" -> do - r <- typeLink' - endTok <- closeBlock - let trm = Term.apps' f [Term.typeLink (ann r) (L.payload r)] - pure (ann startTok <> ann endTok, trm) - "syntax.docExample" -> do - trm <- term - endTok <- closeBlock - let spanAnn = ann startTok <> ann endTok - pure . (spanAnn,) $ case trm of - tm@(Term.Apps' _ xs) -> - let fvs = List.Extra.nubOrd $ concatMap (toList . Term.freeVars) xs - n = Term.nat (ann tm) (fromIntegral (length fvs)) - lam = addDelay $ Term.lam' (ann tm) ((Ann.GeneratedFrom spanAnn,) <$> fvs) tm - in Term.apps' f [n, lam] - tm -> Term.apps' f [Term.nat (ann tm) 0, addDelay tm] - "syntax.docTransclude" -> evalLike id - "syntax.docEvalInline" -> evalLike addDelay - "syntax.docExampleBlock" -> do - (spanAnn, tm) <- block'' False True "syntax.docExampleBlock" (pure (void startTok)) closeBlock - pure $ (spanAnn, Term.apps' f [Term.nat (ann tm) 0, addDelay tm]) - "syntax.docEval" -> do - (spanAnn, tm) <- block' False "syntax.docEval" (pure (void startTok)) closeBlock - pure $ (spanAnn, Term.apps' f [addDelay tm]) - _ -> regular + cata :: (Functor f) => (f a -> a) -> Cofree f x -> a + cata fn (_ :< fx) = fn $ cata fn <$> fx + + gann :: (Annotated a) => a -> Ann + gann = Ann.GeneratedFrom . ann + + addDelay :: Term v Ann -> Term v Ann + addDelay tm = Term.delay (ann tm) tm + + f :: (Annotated a) => a -> String -> Term v Ann + f a = Term.var (gann a) . Var.nameds . ("syntax.doc" <>) + + docUntitledSection :: Ann -> L.DocUntitledSection (Term v Ann) -> Term v Ann + docUntitledSection ann (L.DocUntitledSection tops) = + Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops + + docTop :: L.DocTop (Term v Ann) -> TermP v m + docTop d = case d of + L.DocSection title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] + L.DocEval code -> + Term.app (gann d) (f d "Eval") . addDelay . snd + <$> subParse (block' False False "syntax.docEval" (pure $ pure ()) $ Ann.External <$ P.eof) code + L.DocExampleBlock code -> + Term.apps' (f d "ExampleBlock") . (Term.nat (gann d) 0 :) . pure . addDelay . snd + <$> subParse (block' False True "syntax.docExampleBlock" (pure $ pure ()) $ Ann.External <$ P.eof) code + L.DocCodeBlock label body -> + pure $ + Term.apps' + (f d "CodeBlock") + [Term.text (ann label) . Text.pack $ L.payload label, Term.text (ann body) . Text.pack $ L.payload body] + L.DocBulletedList items -> + pure $ Term.app (gann d) (f d "BulletedList") . Term.list (gann items) . toList $ docColumn <$> items + L.DocNumberedList items@((n, _) :| _) -> + pure $ + Term.apps' + (f d "NumberedList") + [Term.nat (ann d) $ L.payload n, Term.list (gann $ snd <$> items) . toList $ docColumn . snd <$> items] + L.DocParagraph leaves -> + Term.app (gann d) (f d "Paragraph") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves + + docColumn :: L.DocColumn (Term v Ann) -> Term v Ann + docColumn d@(L.DocColumn para sublist) = + Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist + + docLeaf :: L.DocLeaf (Term v Ann) -> TermP v m + docLeaf d = case d of + L.DocLink link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link + L.DocNamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) + L.DocExample code -> do + trm <- subParse term code + pure . Term.apps' (f d "Example") $ case trm of + tm@(Term.Apps' _ xs) -> + let fvs = List.Extra.nubOrd $ concatMap (toList . Term.freeVars) xs + n = Term.nat (ann tm) (fromIntegral (length fvs)) + lam = addDelay $ Term.lam' (ann tm) ((mempty,) <$> fvs) tm + in [n, lam] + tm -> [Term.nat (ann tm) 0, addDelay tm] + L.DocTransclude code -> Term.app (gann d) (f d "Transclude") <$> subParse term code + L.DocBold para -> pure $ Term.app (gann d) (f d "Bold") para + L.DocItalic para -> pure $ Term.app (gann d) (f d "Italic") para + L.DocStrikethrough para -> pure $ Term.app (gann d) (f d "Strikethrough") para + L.DocVerbatim leaf -> Term.app (gann d) (f d "Verbatim") <$> docLeaf (vacuous leaf) + L.DocCode leaf -> Term.app (gann d) (f d "Code") <$> docLeaf (vacuous leaf) + L.DocSource elems -> + Term.app (gann d) (f d "Source") . Term.list (ann elems) . toList <$> traverse docSourceElement elems + L.DocFoldedSource elems -> + Term.app (gann d) (f d "FoldedSource") . Term.list (ann elems) . toList <$> traverse docSourceElement elems + L.DocEvalInline code -> Term.app (gann d) (f d "EvalInline") . addDelay <$> subParse term code + L.DocSignature links -> + Term.app (gann d) (f d "Signature") . Term.list (ann links) . toList <$> traverse docEmbedSignatureLink links + L.DocSignatureInline link -> Term.app (gann d) (f d "SignatureInline") <$> docEmbedSignatureLink link + L.DocWord txt -> pure . Term.app (gann d) (f d "Word") . Term.text (ann txt) . Text.pack $ L.payload txt + L.DocGroup (L.DocJoin leaves) -> + Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList + <$> traverse docLeaf leaves + + docEmbedLink :: L.DocEmbedLink -> TermP v m + docEmbedLink d = case d of + L.DocEmbedTypeLink ident -> + Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload + <$> findUniqueType (HQ'.toHQ <$> ident) + L.DocEmbedTermLink ident -> + Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + + docSourceElement :: L.DocSourceElement -> TermP v m + docSourceElement d@(L.DocSourceElement link anns) = do + link' <- docEmbedLink link + anns' <- traverse docEmbedAnnotation anns + pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] + + docEmbedSignatureLink :: L.DocEmbedSignatureLink -> TermP v m + docEmbedSignatureLink d@(L.DocEmbedSignatureLink ident) = + Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + + docEmbedAnnotation :: L.DocEmbedAnnotation -> TermP v m + docEmbedAnnotation d@(L.DocEmbedAnnotation a) = + -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a + -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes + -- produced for an identifier and the lexemes consumed for text line up. So, I think this is a bugfix I can’t + -- avoid. + Term.app (gann d) (f d "EmbedAnnotation") <$> either (resolveHashQualified . fmap HQ'.toHQ) (docLeaf . vacuous) a docBlock :: (Monad m, Var v) => TermP v m docBlock = do @@ -1143,7 +1142,7 @@ customFailure :: (P.MonadParsec e s m) => e -> m a customFailure = P.customFailure block :: forall m v. (Monad m, Var v) => String -> P v m (Ann, Term v Ann) -block s = block' False s (openBlockWith s) closeBlock +block s = block' False False s (openBlockWith s) closeBlock -- example: use Foo.bar.Baz + ++ x -- + ++ and x are called the "suffixes" of the `use` statement, and @@ -1213,24 +1212,16 @@ substImports ns imports = ] block' :: - (Monad m, Var v) => - IsTop -> - String -> - P v m (L.Token ()) -> - P v m (L.Token ()) -> - P v m (Ann {- ann which spans the whole block -}, Term v Ann) -block' isTop = block'' isTop False - -block'' :: forall m v end. (Monad m, Var v, Annotated end) => IsTop -> - Bool -> -- `True` means insert `()` at end of block if it ends with a statement + -- | `True` means insert `()` at end of block if it ends with a statement + Bool -> String -> P v m (L.Token ()) -> P v m end -> P v m (Ann {- ann which spans the whole block -}, Term v Ann) -block'' isTop implicitUnitAtEnd s openBlock closeBlock = do +block' isTop implicitUnitAtEnd s openBlock closeBlock = do open <- openBlock (names, imports) <- imports _ <- optional semi diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index fd27118050..3c5041cf6f 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -42,6 +42,7 @@ import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.State qualified as S import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) import Data.Foldable qualified as Foldable +import Data.Functor.Classes import Data.List qualified as List import Data.List.Extra qualified as List import Data.List.NonEmpty (NonEmpty ((:|))) @@ -50,7 +51,6 @@ import Data.List.NonEmpty qualified as NonEmpty import Data.Map.Strict qualified as Map import Data.Set qualified as Set import Data.Text qualified as Text -import Data.Void (vacuous) import GHC.Exts (sortWith) import Text.Megaparsec qualified as P import Text.Megaparsec.Char (char) @@ -65,7 +65,7 @@ import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment -import Unison.Parser.Ann (Ann (Ann, GeneratedFrom), Annotated (..)) +import Unison.Parser.Ann (Ann (Ann), Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as SH @@ -158,6 +158,7 @@ data Lexeme | Bytes Bytes.Bytes -- bytes literals | Hash ShortHash -- hash literals | Err Err + | Doc (DocUntitledSection DocTree) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? @@ -389,6 +390,7 @@ displayLexeme = \case Bytes _b -> "bytes literal" Hash h -> Text.unpack (SH.toText h) Err e -> show e + Doc _ -> "doc structure" infixl 2 <+> @@ -436,7 +438,7 @@ doc2 = do CP.space env0 <- S.get -- Disable layout while parsing the doc block and reset the section number - (docToks, closeTok) <- local + (docTok, closeTok) <- local ( \env -> env { inLayout = False, @@ -444,16 +446,18 @@ doc2 = do } ) do - bodyToks <- docBody (lit "}}") + body <- docBody (lit "}}") closeStart <- posP lit "}}" closeEnd <- posP - pure (docToLexemes (openStart, closeEnd) bodyToks, Token Close closeStart closeEnd) + pure (Token (Doc body) openStart closeEnd, Token Close closeStart closeEnd) -- Parse any layout tokens after the doc block, e.g. virtual semicolon endToks <- token' ignore (pure ()) -- Hack to allow anonymous doc blocks before type decls -- {{ Some docs }} Foo.doc = {{ Some docs }} -- ability Foo where => ability Foo where + -- + -- __FIXME__: This should be done _after_ parsing, not in lexing. tn <- subsequentTypeName pure $ beforeStartToks <> case (tn) of @@ -462,12 +466,13 @@ doc2 = do | isTopLevel -> Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd : Token (Open "=") openStart openEnd - : docToks - -- We need an extra 'Close' here because we added an extra Open above. - <> (closeTok : endToks) + : docTok + -- We need an extra 'Close' here because we added an extra Open above. + : closeTok + : endToks where isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docToks <> endToks + _ -> docTok : endToks where -- DUPLICATED wordyKw kw = separated wordySep (lit kw) @@ -527,6 +532,15 @@ data DocTop a | DocParagraph (NonEmpty (DocLeaf a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) +instance Eq1 DocTop where + liftEq _ _ _ = True + +instance Ord1 DocTop where + liftCompare _ _ _ = LT + +instance Show1 DocTop where + liftShowsPrec _ _ _ _ x = x + data DocColumn a = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List DocColumn a (Maybe a) @@ -625,76 +639,6 @@ instance Annotated DocEmbedSignatureLink where instance Annotated DocEmbedAnnotation where ann (DocEmbedAnnotation a) = either ann ann a --- | This is a short-term hack to turn our parse tree back into the sequence of lexemes the current parser expects. --- --- The medium-term solution is to preserve @[`DocTree`]@ as its own lexeme type, and hand it to the parser without --- flattening it back to tokens. Longer-term, maybe we add a real lexer for @Doc@, and then whatever is left of this --- parser moves into the actual parser. -docToLexemes :: (Pos, Pos) -> DocUntitledSection DocTree -> [Token Lexeme] -docToLexemes (startDoc, endDoc) (DocUntitledSection tops) = - Token (Open "syntax.docUntitledSection") startDoc startDoc - : concatMap cata tops <> pure (Token Close endDoc endDoc) - where - wrap :: Ann -> String -> [Token Lexeme] -> [Token Lexeme] - wrap ann suffix lexemes = go (extractStart ann) lexemes - where - extractStart = \case - Ann start _ -> start - GeneratedFrom a -> extractStart a - a -> error $ "expected a good Pos! Got: " <> show a - o = "syntax.doc" <> suffix - go start [] = [Token (Open o) start start, Token Close start start] - go start ts@(Token _ x _ : _) = - Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) - where - final = last ts - cata :: DocTree -> [Token Lexeme] - cata (a :< top) = docTop a $ cata <$> top - docTop start = \case - DocSection title body -> wrap start "Section" $ title <> join body - DocEval code -> wrap start "Eval" code - DocExampleBlock code -> wrap start "ExampleBlock" code - DocCodeBlock label text -> wrap start "CodeBlock" [Textual <$> label, Textual <$> text] - DocBulletedList items -> wrap start "BulletedList" . concat $ (\col -> docColumn (ann col) col) <$> items - DocNumberedList items -> - wrap start "NumberedList" . concat $ - uncurry (:) . bimap (Numeric . show <$>) (\col -> docColumn (ann col) col) - <$> items - DocParagraph body -> wrap start "Paragraph" . concat $ (\l -> docLeaf (ann l) l) <$> body - docColumn start (DocColumn para mlist) = wrap start "Column" $ foldr (flip (<>)) para mlist - docLeaf start = \case - DocLink link -> wrap start "Link" $ docEmbedLink (ann link) link - DocNamedLink name target -> wrap start "NamedLink" $ name <> docLeaf (ann target) (vacuous target) - DocExample code -> wrap start "Example" code - DocTransclude code -> wrap start "Transclude" code - DocBold para -> wrap start "Bold" para - DocItalic para -> wrap start "Italic" para - DocStrikethrough para -> wrap start "Strikethrough" para - DocVerbatim word -> wrap start "Verbatim" . docLeaf (ann word) $ vacuous word - DocCode word -> wrap start "Code" . docLeaf (ann word) $ vacuous word - DocSource elems -> wrap start "Source" . concat $ (\e -> docSourceElement (ann e) e) <$> elems - DocFoldedSource elems -> wrap start "FoldedSource" . concat $ (\e -> docSourceElement (ann e) e) <$> elems - DocEvalInline code -> wrap start "EvalInline" code - DocSignature links -> wrap start "Signature" . concat $ (\l -> docEmbedSignatureLink (ann l) l) <$> links - DocSignatureInline link -> wrap start "SignatureInline" $ docEmbedSignatureLink (ann link) link - DocWord text -> wrap start "Word" . pure $ Textual <$> text - DocGroup (DocJoin leaves) -> - wrap start "Group" . wrap start "Join" . concat $ (\l -> docLeaf (ann l) l) <$> leaves - docEmbedLink start = \case - DocEmbedTypeLink ident -> wrap start "EmbedTypeLink" . pure $ identifierLexeme <$> ident - DocEmbedTermLink ident -> wrap start "EmbedTermLink" . pure $ identifierLexeme <$> ident - docSourceElement start (DocSourceElement link anns) = - wrap start "SourceElement" $ - docEmbedLink (ann link) link - <> maybe - [] - ((Token (Reserved "@") (Pos 0 0) (Pos 0 0) :) . concatMap (\a -> docEmbedAnnotation (ann a) a)) - (NonEmpty.nonEmpty anns) - docEmbedSignatureLink start (DocEmbedSignatureLink ident) = - wrap start "EmbedSignatureLink" . pure $ identifierLexeme <$> ident - docEmbedAnnotation start (DocEmbedAnnotation a) = - wrap start "EmbedAnnotation" $ either (pure . fmap identifierLexeme) (\l -> docLeaf (ann l) $ vacuous l) a - -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). docBody :: P end -> P (DocUntitledSection DocTree) @@ -1741,6 +1685,7 @@ instance P.VisualStream [Token Lexeme] where pretty Close = "" pretty (Semi True) = "" pretty (Semi False) = ";" + pretty (Doc d) = show d pad (Pos line1 col1) (Pos line2 col2) = if line1 == line2 then replicate (col2 - col1) ' ' diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 4945f4347e..733ecc93cf 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -5,7 +5,8 @@ module Unison.Syntax.Parser ( Annotated (..), Err, Error (..), - Input, + -- FIXME: Don’t export the data constructor + Input (..), P, ParsingEnv (..), UniqueName, @@ -16,6 +17,7 @@ module Unison.Syntax.Parser chainr1, character, closeBlock, + doc, failCommitted, failureIf, hqInfixId, @@ -393,6 +395,11 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing +doc :: (Ord v) => P v m (L.Token (L.DocUntitledSection L.DocTree)) +doc = queryToken \case + L.Doc d -> pure d + _ -> Nothing + -- | Parses a tuple of 'a's, or a single parenthesized 'a' -- -- returns the result of combining elements with 'pair', alongside the annotation containing From 159ea3a433ad18a30af64984673fede127866def Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 22 Jul 2024 15:03:00 -0600 Subject: [PATCH 17/50] Extract `preParse` from `lexer` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After running the core of the lexer, the `lexer` function then does some work to turn the stream into a tree, and reorder some lexemes. It then throws away the tree structure. This is the first step of preserving the tree structure for the parser. It extracts the “pre-parser” from `lexer` so that it can eventually be used _after_ the lexer, rather than internally. This also moves `fixup` to be applied on each block as we reorder it, rather than across the entire stream at the end (since the goal is to not _have_ an entire stream any more). --- unison-syntax/src/Unison/Syntax/Lexer.hs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 3c5041cf6f..fd48d6abb3 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1564,7 +1564,7 @@ stanzas = go [] -- Moves type and ability declarations to the front of the token stream -- and move `use` statements to the front of each block reorder :: [T (Token Lexeme)] -> [T (Token Lexeme)] -reorder = join . sortWith f . stanzas +reorder = foldr fixup [] . join . sortWith f . stanzas where f [] = 3 :: Int f (t0 : _) = case payload $ headToken t0 of @@ -1572,16 +1572,17 @@ reorder = join . sortWith f . stanzas Open typOrA | Set.member (Text.pack typOrA) typeOrAbility -> 1 Reserved "use" -> 0 _ -> 3 :: Int + -- after reordering can end up with trailing semicolon at the end of + -- a block, which we remove with this pass + fixup (payload . headToken -> Semi _) [] = [] + fixup tok tail = tok : tail + +-- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. +preParse :: [Token Lexeme] -> T (Token Lexeme) +preParse = reorderTree reorder . tree lexer :: String -> String -> [Token Lexeme] -lexer scope rem = - let t = tree $ lexer0' scope rem - -- after reordering can end up with trailing semicolon at the end of - -- a block, which we remove with this pass - fixup ((payload -> Semi _) : t@(payload -> Close) : tl) = t : fixup tl - fixup [] = [] - fixup (h : t) = h : fixup t - in fixup . toList $ reorderTree reorder t +lexer scope = toList . preParse . lexer0' scope isDelayOrForce :: Char -> Bool isDelayOrForce op = op == '\'' || op == '!' From 32472bd9e01cb87ac310f375a199e44800c05b9a Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 19:46:17 -0600 Subject: [PATCH 18/50] Allow EOF to close layout blocks This removes the need to pad the lexer stream with trailing `Close` lexemes. If EOF is reached, the parser will automatically close any layout blocks (but not context-free blocks). --- .../src/Unison/Syntax/TermParser.hs | 25 +++--- .../reparses-with-same-hash.u | 2 +- .../transcripts/error-messages.output.md | 2 + unison-syntax/src/Unison/Syntax/Lexer.hs | 87 ++++++++++--------- unison-syntax/src/Unison/Syntax/Parser.hs | 6 ++ 5 files changed, 71 insertions(+), 51 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 1f1dda24c1..999d5658ba 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -103,7 +103,7 @@ rewriteBlock = do rewriteTermlike kw mk = do kw <- quasikeyword kw lhs <- term - (_spanAnn, rhs) <- block "==>" + (_spanAnn, rhs) <- layoutBlock "==>" pure (mk (ann kw <> ann rhs) lhs rhs) rewriteTerm = rewriteTermlike "term" DD.rewriteTerm rewriteCase = rewriteTermlike "case" DD.rewriteCase @@ -164,13 +164,13 @@ match :: (Monad m, Var v) => TermP v m match = do start <- openBlockWith "match" scrutinee <- term - _ <- closeBlock + _ <- optionalCloseBlock _ <- P.try (openBlockWith "with") <|> do t <- anyToken P.customFailure (ExpectedBlockOpen "with" t) (_arities, cases) <- NonEmpty.unzip <$> matchCases1 start - _ <- closeBlock + _ <- optionalCloseBlock pure $ Term.match (ann start <> ann (NonEmpty.last cases)) @@ -212,10 +212,10 @@ matchCase = do [ Nothing <$ P.try (quasikeyword "otherwise"), Just <$> infixAppOrBooleanOp ] - (_spanAnn, t) <- block "->" + (_spanAnn, t) <- layoutBlock "->" pure (guard, t) let unguardedBlock = label "case match" do - (_spanAnn, t) <- block "->" + (_spanAnn, t) <- layoutBlock "->" pure (Nothing, t) -- a pattern's RHS is either one or more guards, or a single unguarded block. guardsAndBlocks <- guardedBlocks <|> (pure @[] <$> unguardedBlock) @@ -357,10 +357,10 @@ lam p = label "lambda" $ mkLam <$> P.try (some prefixDefinitionName <* reserved in Term.lam' (ann (head vs) <> ann b) annotatedArgs b letBlock, handle, ifthen :: (Monad m, Var v) => TermP v m -letBlock = label "let" $ (snd <$> block "let") +letBlock = label "let" $ (snd <$> layoutBlock "let") handle = label "handle" do (handleSpan, b) <- block "handle" - (_withSpan, handler) <- block "with" + (_withSpan, handler) <- layoutBlock "with" -- We don't use the annotation span from 'with' here because it will -- include a dedent if it's at the end of block. -- Meaning the newline gets overwritten when pretty-printing and it messes things up. @@ -377,7 +377,7 @@ lamCase = do start <- openBlockWith "cases" cases <- matchCases1 start (arity, cases) <- checkCasesArities cases - _ <- closeBlock + _ <- optionalCloseBlock lamvars <- replicateM arity (Parser.uniqueName 10) let vars = Var.named <$> [tweak v i | (v, i) <- lamvars `zip` [(1 :: Int) ..]] @@ -396,7 +396,7 @@ ifthen = label "if" do start <- peekAny (_spanAnn, c) <- block "if" (_spanAnn, t) <- block "then" - (_spanAnn, f) <- block "else" + (_spanAnn, f) <- layoutBlock "else" pure $ Term.iff (ann start <> ann f) c t f text :: (Var v) => TermP v m @@ -987,7 +987,7 @@ delayQuote = P.label "quote" do delayBlock :: (Monad m, Var v) => P v m (Ann {- Ann spanning the whole block -}, Term v Ann) delayBlock = P.label "do" do - (spanAnn, b) <- block "do" + (spanAnn, b) <- layoutBlock "do" let argSpan = (ann b {- would be nice to use the annotation for 'do' here, but it's not terribly important -}) pure $ (spanAnn, DD.delayTerm (ann b) argSpan b) @@ -1074,7 +1074,7 @@ destructuringBind = do let boundVars' = snd <$> boundVars _ <- P.lookAhead (openBlockWith "=") pure (p, boundVars') - (_spanAnn, scrute) <- block "=" -- Dwight K. Scrute ("The People's Scrutinee") + (_spanAnn, scrute) <- layoutBlock "=" -- Dwight K. Scrute ("The People's Scrutinee") let guard = Nothing let absChain vs t = foldr (\v t -> ABT.abs' (ann t) v t) t vs thecase t = Term.MatchCase p (fmap (absChain boundVars) guard) $ absChain boundVars t @@ -1144,6 +1144,9 @@ customFailure = P.customFailure block :: forall m v. (Monad m, Var v) => String -> P v m (Ann, Term v Ann) block s = block' False False s (openBlockWith s) closeBlock +layoutBlock :: forall m v. (Monad m, Var v) => String -> P v m (Ann, Term v Ann) +layoutBlock s = block' False False s (openBlockWith s) optionalCloseBlock + -- example: use Foo.bar.Baz + ++ x -- + ++ and x are called the "suffixes" of the `use` statement, and -- `Foo.bar.Baz` is called the prefix. A `use` statement has the effect diff --git a/unison-src/transcripts-round-trip/reparses-with-same-hash.u b/unison-src/transcripts-round-trip/reparses-with-same-hash.u index 98fbe28a57..5d75eff442 100644 --- a/unison-src/transcripts-round-trip/reparses-with-same-hash.u +++ b/unison-src/transcripts-round-trip/reparses-with-same-hash.u @@ -542,7 +542,7 @@ fix_4384d = {{ {{ docExampleBlock 0 '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17, fix_4384e = id : x -> x id x = x - {{ {{ docExampleBlock 0 (id id id id id id id id id id id id id id id id id id id id id (x -> 0) }} }} + {{ {{ docExampleBlock 0 (id id id id id id id id id id id id id id id id id id id id id (x -> 0)) }} }} fnApplicationSyntax = Environment.default = do 1 + 1 diff --git a/unison-src/transcripts/error-messages.output.md b/unison-src/transcripts/error-messages.output.md index 0b3e334aa6..03e7e652ac 100644 --- a/unison-src/transcripts/error-messages.output.md +++ b/unison-src/transcripts/error-messages.output.md @@ -290,6 +290,7 @@ x = match Some a with I was surprised to find a -> here. I was expecting one of these instead: + * end of input * newline or semicolon ``` @@ -312,6 +313,7 @@ x = match Some a with I was surprised to find a '|' here. I was expecting one of these instead: + * end of input * newline or semicolon ``` diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index fd48d6abb3..2ca3dc3738 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -288,9 +288,9 @@ showErrorFancy = \case GT -> "greater than " P.ErrorCustom a -> P.showErrorComponent a -lexer0' :: String -> String -> [Token Lexeme] -lexer0' scope rem = - case flip S.evalState env0 $ P.runParserT lexemes scope rem of +lexer :: String -> String -> [Token Lexeme] +lexer scope rem = + case flip S.evalState env0 $ P.runParserT (lexemes eof) scope rem of Left e -> let errsWithSourcePos = fst $ @@ -326,8 +326,14 @@ lexer0' scope rem = endPos = startPos & \(Pos l c) -> Pos l (c + errorLength) in [Token (Err err) startPos endPos] in errsWithSourcePos >>= errorToTokens - Right ts -> Token (Open scope) topLeftCorner topLeftCorner : tweak ts + Right ts -> postLex $ Token (Open scope) topLeftCorner topLeftCorner : ts where + eof :: P [Token Lexeme] + eof = P.try do + p <- P.eof >> posP + n <- maybe 0 (const 1) <$> S.gets opening + l <- S.gets layout + pure $ replicate (length l + n) (Token Close p p) errorItemToString :: EP.ErrorItem Char -> String errorItemToString = \case (P.Tokens ts) -> Foldable.toList ts @@ -336,28 +342,31 @@ lexer0' scope rem = customErrs es = [Err <$> e | P.ErrorCustom e <- toList es] toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col) env0 = ParsingEnv [] (Just scope) True [0] 0 - -- hacky postprocessing pass to do some cleanup of stuff that's annoying to - -- fix without adding more state to the lexer: - -- - 1+1 lexes as [1, +1], convert this to [1, +, 1] - -- - when a semi followed by a virtual semi, drop the virtual, lets you - -- write - -- foo x = action1; - -- 2 - -- - semi immediately after first Open is ignored - tweak [] = [] - tweak (h@(payload -> Semi False) : (payload -> Semi True) : t) = h : tweak t - tweak (h@(payload -> Reserved _) : t) = h : tweak t - tweak (t1 : t2@(payload -> Numeric num) : rem) - | notLayout t1 && touches t1 t2 && isSigned num = - t1 - : Token - (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num))))) - (start t2) - (inc $ start t2) - : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2) - : tweak rem - tweak (h : t) = h : tweak t + +-- | hacky postprocessing pass to do some cleanup of stuff that's annoying to +-- fix without adding more state to the lexer: +-- - 1+1 lexes as [1, +1], convert this to [1, +, 1] +-- - when a semi followed by a virtual semi, drop the virtual, lets you +-- write +-- foo x = action1; +-- 2 +-- - semi immediately after first Open is ignored +tweak :: (Token Lexeme) -> [Token Lexeme] -> [Token Lexeme] +tweak h@(Token (Semi False) _ _) (Token (Semi True) _ _ : t) = h : t +-- __NB__: This case only exists to guard against the following one +tweak h@(Token (Reserved _) _ _) t = h : t +tweak t1 (t2@(Token (Numeric num) _ _) : rem) + | notLayout t1 && touches t1 t2 && isSigned num = + t1 + : Token + (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num))))) + (start t2) + (inc $ start t2) + : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2) + : rem + where isSigned num = all (\ch -> ch == '-' || ch == '+') $ take 1 num +tweak h t = h : t formatTrivialError :: Set String -> Set String -> [Char] formatTrivialError unexpectedTokens expectedTokens = @@ -377,7 +386,7 @@ formatTrivialError unexpectedTokens expectedTokens = displayLexeme :: Lexeme -> String displayLexeme = \case Open o -> o - Semi True -> "end of section" + Semi True -> "end of stanza" Semi False -> "semicolon" Close -> "end of section" Reserved r -> "'" <> r <> "'" @@ -397,16 +406,6 @@ infixl 2 <+> (<+>) :: (Monoid a) => P a -> P a -> P a p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) -lexemes :: P [Token Lexeme] -lexemes = lexemes' eof - where - eof :: P [Token Lexeme] - eof = P.try do - p <- P.eof >> posP - n <- maybe 0 (const 1) <$> S.gets opening - l <- S.gets layout - pure $ replicate (length l + n) (Token Close p p) - -- Runs the parser `p`, then: -- 1. resets the layout stack to be what it was before `p`. -- 2. emits enough closing tokens to reach `lbl` but not pop it. @@ -998,7 +997,14 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] -lexemes' eof = +lexemes' = + -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one, + -- runs `postLex`, then removes it. + fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) . lexemes + +-- | Consumes an entire Unison “module”. +lexemes :: P [Token Lexeme] -> P [Token Lexeme] +lexemes eof = P.optional space >> do hd <- join <$> P.manyTill toks (P.lookAhead eof) tl <- eof @@ -1581,8 +1587,11 @@ reorder = foldr fixup [] . join . sortWith f . stanzas preParse :: [Token Lexeme] -> T (Token Lexeme) preParse = reorderTree reorder . tree -lexer :: String -> String -> [Token Lexeme] -lexer scope = toList . preParse . lexer0' scope +-- | A few transformations that happen between lexing and parsing. +-- +-- All of these things should move out of the lexer, and be applied in the parse. +postLex :: [Token Lexeme] -> [Token Lexeme] +postLex = toList . preParse . foldr tweak [] isDelayOrForce :: Char -> Bool isDelayOrForce op = op == '\'' || op == '!' diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 733ecc93cf..498e460f3f 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -17,6 +17,7 @@ module Unison.Syntax.Parser chainr1, character, closeBlock, + optionalCloseBlock, doc, failCommitted, failureIf, @@ -270,6 +271,11 @@ semi = label "newline or semicolon" $ queryToken go closeBlock :: (Ord v) => P v m (L.Token ()) closeBlock = void <$> matchToken L.Close +-- | With layout, blocks might “close” without an explicit outdent (e.g., not even a newline at the end of a +-- `DocTransclude`). This allows those blocks to be closed by EOF. +optionalCloseBlock :: (Ord v) => P v m (L.Token ()) +optionalCloseBlock = closeBlock <|> (\() -> L.Token () mempty mempty) <$> P.eof + wordyPatternName :: (Var v) => P v m (L.Token v) wordyPatternName = queryToken \case L.WordyId (HQ'.NameOnly n) -> Just $ Name.toVar n From 94065e06104b155a66fd8268e6498e0540ba5108 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 19:54:06 -0600 Subject: [PATCH 19/50] Make comments into Haddock --- unison-syntax/src/Unison/Syntax/Lexer.hs | 13 +++--- unison-syntax/src/Unison/Syntax/Parser.hs | 55 ++++++++++++----------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 2ca3dc3738..ce2d63b564 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -90,17 +90,16 @@ type BlockName = String type Layout = [(BlockName, Column)] data ParsingEnv = ParsingEnv - { -- layout stack + { -- | layout stack layout :: !Layout, - -- `Just b` if a block of type `b` is being opened + -- | `Just b` if a block of type `b` is being opened opening :: Maybe BlockName, - -- are we inside a construct that uses layout? + -- | are we inside a construct that uses layout? inLayout :: Bool, - -- Use a stack to remember the parent section and - -- allow docSections within docSections. - -- 1 means we are inside a # Heading 1 + -- | Use a stack to remember the parent section and allow docSections within docSections. + -- - 1 means we are inside a # Heading 1 parentSections :: [Int], - -- 4 means we are inside a list starting at the fourth column + -- | 4 means we are inside a list starting at the fourth column parentListColumn :: Int } deriving (Show) diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 498e460f3f..e12a2a94c4 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -158,19 +158,20 @@ data Error v | UnknownType (L.Token (HQ.HashQualified Name)) (Set Reference) | UnknownId (L.Token (HQ.HashQualified Name)) (Set Referent) (Set Reference) | ExpectedBlockOpen String (L.Token L.Lexeme) - | -- Indicates a cases or match/with which doesn't have any patterns + | -- | Indicates a cases or match/with which doesn't have any patterns EmptyMatch (L.Token ()) | EmptyWatch Ann | UseInvalidPrefixSuffix (Either (L.Token Name) (L.Token Name)) (Maybe [L.Token Name]) | UseEmpty (L.Token String) -- an empty `use` statement | DidntExpectExpression (L.Token L.Lexeme) (Maybe (L.Token L.Lexeme)) | TypeDeclarationErrors [UF.Error v Ann] - | -- MissingTypeModifier (type|ability) name + | -- | MissingTypeModifier (type|ability) name MissingTypeModifier (L.Token String) (L.Token v) | ResolutionFailures [Names.ResolutionFailure v Ann] | DuplicateTypeNames [(v, [Ann])] | DuplicateTermNames [(v, [Ann])] - | PatternArityMismatch Int Int Ann -- PatternArityMismatch expectedArity actualArity location + | -- | PatternArityMismatch expectedArity actualArity location + PatternArityMismatch Int Int Ann | FloatPattern Ann deriving (Show, Eq, Ord) @@ -242,11 +243,11 @@ run' p s name env = run :: (Monad m, Ord v) => P v m a -> String -> ParsingEnv m -> m (Either (Err v) a) run p s = run' p s "" --- Virtual pattern match on a lexeme. +-- | Virtual pattern match on a lexeme. queryToken :: (Ord v) => (L.Lexeme -> Maybe a) -> P v m (L.Token a) queryToken f = P.token (traverse f) Set.empty --- Consume a block opening and return the string that opens the block. +-- | Consume a block opening and return the string that opens the block. openBlock :: (Ord v) => P v m (L.Token String) openBlock = queryToken getOpen where @@ -256,23 +257,23 @@ openBlock = queryToken getOpen openBlockWith :: (Ord v) => String -> P v m (L.Token ()) openBlockWith s = void <$> P.satisfy ((L.Open s ==) . L.payload) --- Match a particular lexeme exactly, and consume it. +-- | Match a particular lexeme exactly, and consume it. matchToken :: (Ord v) => L.Lexeme -> P v m (L.Token L.Lexeme) matchToken x = P.satisfy ((==) x . L.payload) --- Consume a virtual semicolon +-- | Consume a virtual semicolon semi :: (Ord v) => P v m (L.Token ()) semi = label "newline or semicolon" $ queryToken go where go (L.Semi _) = Just () go _ = Nothing --- Consume the end of a block +-- | Consume the end of a block closeBlock :: (Ord v) => P v m (L.Token ()) closeBlock = void <$> matchToken L.Close -- | With layout, blocks might “close” without an explicit outdent (e.g., not even a newline at the end of a --- `DocTransclude`). This allows those blocks to be closed by EOF. +-- `DocTransclude`). This allows those blocks to be closed by EOF. optionalCloseBlock :: (Ord v) => P v m (L.Token ()) optionalCloseBlock = closeBlock <|> (\() -> L.Token () mempty mempty) <$> P.eof @@ -281,13 +282,13 @@ wordyPatternName = queryToken \case L.WordyId (HQ'.NameOnly n) -> Just $ Name.toVar n _ -> Nothing --- Parse an prefix identifier e.g. Foo or (+), discarding any hash +-- | Parse a prefix identifier e.g. Foo or (+), discarding any hash prefixDefinitionName :: (Var v) => P v m (L.Token v) prefixDefinitionName = wordyDefinitionName <|> parenthesize symbolyDefinitionName --- Parse a prefix identifier e.g. Foo or (+), rejecting any hash --- This is useful for term declarations, where type signatures and term names should not have hashes. +-- | Parse a prefix identifier e.g. Foo or (+), rejecting any hash +-- This is useful for term declarations, where type signatures and term names should not have hashes. prefixTermName :: (Var v) => P v m (L.Token v) prefixTermName = wordyTermName <|> parenthesize symbolyTermName where @@ -299,34 +300,34 @@ prefixTermName = wordyTermName <|> parenthesize symbolyTermName L.SymbolyId (HQ'.NameOnly n) -> Just $ Name.toVar n _ -> Nothing --- Parse a wordy identifier e.g. Foo, discarding any hash +-- | Parse a wordy identifier e.g. Foo, discarding any hash wordyDefinitionName :: (Var v) => P v m (L.Token v) wordyDefinitionName = queryToken $ \case L.WordyId n -> Just $ Name.toVar (HQ'.toName n) L.Blank s -> Just $ Var.nameds ("_" <> s) _ -> Nothing --- Parse a wordyId as a Name, rejecting any hash +-- | Parse a wordyId as a Name, rejecting any hash importWordyId :: (Ord v) => P v m (L.Token Name) importWordyId = queryToken \case L.WordyId (HQ'.NameOnly n) -> Just n L.Blank s | not (null s) -> Just $ Name.unsafeParseText (Text.pack ("_" <> s)) _ -> Nothing --- The `+` in: use Foo.bar + as a Name +-- | The `+` in: use Foo.bar + as a Name importSymbolyId :: (Ord v) => P v m (L.Token Name) importSymbolyId = queryToken \case L.SymbolyId (HQ'.NameOnly n) -> Just n _ -> Nothing --- Parse a symboly ID like >>= or &&, discarding any hash +-- | Parse a symboly ID like >>= or &&, discarding any hash symbolyDefinitionName :: (Var v) => P v m (L.Token v) symbolyDefinitionName = queryToken $ \case L.SymbolyId n -> Just $ Name.toVar (HQ'.toName n) _ -> Nothing -- | Expect parentheses around a token, includes the parentheses within the start/end --- annotations of the resulting token. +-- annotations of the resulting token. parenthesize :: (Ord v) => P v m (L.Token a) -> P v m (L.Token a) parenthesize p = do (start, a) <- P.try do @@ -340,7 +341,7 @@ hqPrefixId, hqInfixId :: (Ord v) => P v m (L.Token (HQ.HashQualified Name)) hqPrefixId = hqWordyId_ <|> parenthesize hqSymbolyId_ hqInfixId = hqSymbolyId_ --- Parse a hash-qualified alphanumeric identifier +-- | Parse a hash-qualified alphanumeric identifier hqWordyId_ :: (Ord v) => P v m (L.Token (HQ.HashQualified Name)) hqWordyId_ = queryToken \case L.WordyId n -> Just $ HQ'.toHQ n @@ -348,20 +349,20 @@ hqWordyId_ = queryToken \case L.Blank s | not (null s) -> Just $ HQ.NameOnly (Name.unsafeParseText (Text.pack ("_" <> s))) _ -> Nothing --- Parse a hash-qualified symboly ID like >>=#foo or && +-- | Parse a hash-qualified symboly ID like >>=#foo or && hqSymbolyId_ :: (Ord v) => P v m (L.Token (HQ.HashQualified Name)) hqSymbolyId_ = queryToken \case L.SymbolyId n -> Just (HQ'.toHQ n) _ -> Nothing --- Parse a reserved word +-- | Parse a reserved word reserved :: (Ord v) => String -> P v m (L.Token String) reserved w = label w $ queryToken getReserved where getReserved (L.Reserved w') | w == w' = Just w getReserved _ = Nothing --- Parse a placeholder or typed hole +-- | Parse a placeholder or typed hole blank :: (Ord v) => P v m (L.Token String) blank = label "blank" $ queryToken getBlank where @@ -436,12 +437,12 @@ chainr1 p op = go1 go1 = p >>= go2 go2 hd = do { op <- op; op hd <$> go1 } <|> pure hd --- Parse `p` 1+ times, combining with `op` +-- | Parse `p` 1+ times, combining with `op` chainl1 :: (Ord v) => P v m a -> P v m (a -> a -> a) -> P v m a chainl1 p op = foldl (flip ($)) <$> p <*> P.many (flip <$> op <*> p) --- If `p` would succeed, this fails uncommitted. --- Otherwise, `failIfOk` used to produce the output +-- | If `p` would succeed, this fails uncommitted. +-- Otherwise, `failIfOk` used to produce the output failureIf :: (Ord v) => P v m (P v m b) -> P v m a -> P v m b failureIf failIfOk p = do dontwant <- P.try . P.lookAhead $ failIfOk @@ -449,9 +450,9 @@ failureIf failIfOk p = do when (isJust p) $ fail "failureIf" dontwant --- Gives this var an id based on its position - a useful trick to --- obtain a variable whose id won't match any other id in the file --- `positionalVar a Var.missingResult` +-- | Gives this var an id based on its position - a useful trick to +-- obtain a variable whose id won't match any other id in the file +-- `positionalVar a Var.missingResult` positionalVar :: (Annotated a, Var v) => a -> v -> v positionalVar a v = let s = start (ann a) From 567238fae8dccfe399e935c919e93a363a30a03a Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 22 Jul 2024 16:44:17 -0600 Subject: [PATCH 20/50] Expose `preParse` to the parser --- parser-typechecker/src/Unison/PrintError.hs | 2 +- unison-syntax/src/Unison/Syntax/Lexer.hs | 28 ++++++++------------- unison-syntax/src/Unison/Syntax/Parser.hs | 11 +++----- unison-syntax/test/Main.hs | 4 +-- 4 files changed, 18 insertions(+), 27 deletions(-) diff --git a/parser-typechecker/src/Unison/PrintError.hs b/parser-typechecker/src/Unison/PrintError.hs index 5647ccde63..8b73b179f1 100644 --- a/parser-typechecker/src/Unison/PrintError.hs +++ b/parser-typechecker/src/Unison/PrintError.hs @@ -1336,7 +1336,7 @@ prettyParseError s e = lexerOutput :: Pretty (AnnotatedText a) lexerOutput = if showLexerOutput - then "\nLexer output:\n" <> fromString (L.debugLex' s) + then "\nLexer output:\n" <> fromString (L.debugPreParse' s) else mempty renderParseErrors :: diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index ce2d63b564..dc755e7c79 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -19,11 +19,11 @@ module Unison.Syntax.Lexer DocJoin (..), DocEmbedAnnotation (..), lexer, + preParse, escapeChars, - debugFileLex, - debugLex', - debugLex'', - debugLex''', + debugFilePreParse, + debugPreParse, + debugPreParse', showEscapeChar, touches, @@ -1628,14 +1628,11 @@ typeModifiersAlt f = inc :: Pos -> Pos inc (Pos line col) = Pos line (col + 1) -debugFileLex :: String -> IO () -debugFileLex file = do - contents <- readUtf8 file - let s = debugLex'' (lexer file (Text.unpack contents)) - putStrLn s +debugFilePreParse :: FilePath -> IO () +debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file -debugLex'' :: [Token Lexeme] -> String -debugLex'' [Token (Err (UnexpectedTokens msg)) start end] = +debugPreParse :: T (Token Lexeme) -> String +debugPreParse (L (Token (Err (UnexpectedTokens msg)) start end)) = (if start == end then msg1 else msg2) <> ":\n" <> msg where msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start) @@ -1648,13 +1645,10 @@ debugLex'' [Token (Err (UnexpectedTokens msg)) start end] = <> show (line end) <> ", column " <> show (column end) -debugLex'' ts = show . fmap payload . tree $ ts +debugPreParse ts = show $ payload <$> ts -debugLex' :: String -> String -debugLex' = debugLex'' . lexer "debugLex" - -debugLex''' :: String -> String -> String -debugLex''' s = debugLex'' . lexer s +debugPreParse' :: String -> String +debugPreParse' = debugPreParse . preParse . lexer "debugPreParse" instance EP.ShowErrorComponent (Token Err) where showErrorComponent (Token err _ _) = go err diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index e12a2a94c4..344de0fd1b 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -61,6 +61,7 @@ where import Control.Monad.Reader (ReaderT (..)) import Control.Monad.Reader.Class (asks) import Crypto.Random qualified as Random +import Data.Bool (bool) import Data.Bytes.Put (runPutS) import Data.Bytes.Serial (serialize) import Data.Bytes.VarInt (VarInt (..)) @@ -199,8 +200,7 @@ label = P.label traceRemainingTokens :: (Ord v) => String -> P v m () traceRemainingTokens label = do remainingTokens <- lookAhead $ many anyToken - let _ = - trace ("REMAINDER " ++ label ++ ":\n" ++ L.debugLex'' remainingTokens) () + let _ = trace ("REMAINDER " ++ label ++ ":\n" ++ L.debugPreParse (L.preParse remainingTokens)) () pure () mkAnn :: (Annotated a, Annotated b) => a -> b -> Ann @@ -231,12 +231,9 @@ rootFile p = p <* P.eof run' :: (Monad m, Ord v) => P v m a -> String -> String -> ParsingEnv m -> m (Either (Err v) a) run' p s name env = - let lex = - if debug - then L.lexer name (trace (L.debugLex''' "lexer receives" s) s) - else L.lexer name s + let lex = bool id (traceWith L.debugPreParse) debug . L.preParse $ L.lexer name s pTraced = traceRemainingTokens "parser receives" *> p - in runReaderT (runParserT pTraced name (Input lex)) env <&> \case + in runReaderT (runParserT pTraced name . Input $ toList lex) env <&> \case Left err -> Left (Nel.head (P.bundleErrors err)) Right x -> Right x diff --git a/unison-syntax/test/Main.hs b/unison-syntax/test/Main.hs index bd40c7ded8..5c13940b0a 100644 --- a/unison-syntax/test/Main.hs +++ b/unison-syntax/test/Main.hs @@ -221,8 +221,8 @@ test = t :: String -> [Lexeme] -> Test () t s expected = - let actual0 = payload <$> lexer "ignored filename" s - actual = take (length actual0 - 2) . drop 1 $ actual0 + let actual0 = payload <$> preParse (lexer "ignored filename" s) + actual = take (length actual0 - 2) . drop 1 $ toList actual0 in scope s $ if actual == expected then ok From 6c561f314628c1e8b6e77fa6c7e0118f67265a71 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 22 Jul 2024 16:45:21 -0600 Subject: [PATCH 21/50] Rename `T` to `BlockTree` --- unison-syntax/src/Unison/Syntax/Lexer.hs | 49 +++++++++++++----------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index dc755e7c79..a83d4da38b 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1520,15 +1520,18 @@ pop = drop 1 topLeftCorner :: Pos topLeftCorner = Pos 1 1 -data T a = T a [T a] [a] | L a deriving (Functor, Foldable, Traversable) - -headToken :: T a -> a -headToken (T a _ _) = a -headToken (L a) = a - -instance (Show a) => Show (T a) where - show (L a) = show a - show (T open mid close) = +data BlockTree a + = Block a [BlockTree a] [a] + | Leaf a + deriving (Functor, Foldable, Traversable) + +headToken :: BlockTree a -> a +headToken (Block a _ _) = a +headToken (Leaf a) = a + +instance (Show a) => Show (BlockTree a) where + show (Leaf a) = show a + show (Block open mid close) = show open ++ "\n" ++ indent " " (intercalateMap "\n" show mid) @@ -1539,26 +1542,26 @@ instance (Show a) => Show (T a) where go by '\n' = '\n' : by go _ c = [c] -reorderTree :: ([T a] -> [T a]) -> T a -> T a -reorderTree _ l@(L _) = l -reorderTree f (T open mid close) = T open (f (reorderTree f <$> mid)) close +reorderTree :: ([BlockTree a] -> [BlockTree a]) -> BlockTree a -> BlockTree a +reorderTree f (Block open mid close) = Block open (f (reorderTree f <$> mid)) close +reorderTree _ l = l -tree :: [Token Lexeme] -> T (Token Lexeme) +tree :: [Token Lexeme] -> BlockTree (Token Lexeme) tree toks = one toks const where - one (open@(payload -> Open _) : ts) k = many (T open) [] ts k - one (t : ts) k = k (L t) ts + one (open@(payload -> Open _) : ts) k = many (Block open) [] ts k + one (t : ts) k = k (Leaf t) ts one [] k = k lastErr [] where - lastErr = case drop (length toks - 1) toks of - [] -> L (Token (Err LayoutError) topLeftCorner topLeftCorner) - (t : _) -> L $ t {payload = Err LayoutError} + lastErr = Leaf case drop (length toks - 1) toks of + [] -> Token (Err LayoutError) topLeftCorner topLeftCorner + (t : _) -> t {payload = Err LayoutError} many open acc [] k = k (open (reverse acc) []) [] many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) [t]) ts many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k -stanzas :: [T (Token Lexeme)] -> [[T (Token Lexeme)]] +stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] stanzas = go [] where go acc [] = [reverse acc] @@ -1568,7 +1571,7 @@ stanzas = go [] -- Moves type and ability declarations to the front of the token stream -- and move `use` statements to the front of each block -reorder :: [T (Token Lexeme)] -> [T (Token Lexeme)] +reorder :: [BlockTree (Token Lexeme)] -> [BlockTree (Token Lexeme)] reorder = foldr fixup [] . join . sortWith f . stanzas where f [] = 3 :: Int @@ -1583,7 +1586,7 @@ reorder = foldr fixup [] . join . sortWith f . stanzas fixup tok tail = tok : tail -- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. -preParse :: [Token Lexeme] -> T (Token Lexeme) +preParse :: [Token Lexeme] -> BlockTree (Token Lexeme) preParse = reorderTree reorder . tree -- | A few transformations that happen between lexing and parsing. @@ -1631,8 +1634,8 @@ inc (Pos line col) = Pos line (col + 1) debugFilePreParse :: FilePath -> IO () debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file -debugPreParse :: T (Token Lexeme) -> String -debugPreParse (L (Token (Err (UnexpectedTokens msg)) start end)) = +debugPreParse :: BlockTree (Token Lexeme) -> String +debugPreParse (Leaf (Token (Err (UnexpectedTokens msg)) start end)) = (if start == end then msg1 else msg2) <> ":\n" <> msg where msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start) From 3158e666033b124c17ec20ff7f96549548533703 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Tue, 23 Jul 2024 13:59:12 -0600 Subject: [PATCH 22/50] Restructure `BlockTree` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now build the stanzas at the same time as the tree, and don’t discard them after reordering. This also changes the closing element of `Block` to be `Maybe` instead of `[]`. --- unison-syntax/src/Unison/Syntax/Lexer.hs | 50 +++++++++++++++--------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index a83d4da38b..a356757bf7 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -39,6 +39,7 @@ module Unison.Syntax.Lexer where import Control.Comonad.Cofree (Cofree ((:<))) +import Control.Lens qualified as Lens import Control.Monad.State qualified as S import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) import Data.Foldable qualified as Foldable @@ -1521,7 +1522,13 @@ topLeftCorner :: Pos topLeftCorner = Pos 1 1 data BlockTree a - = Block a [BlockTree a] [a] + = Block + -- | The token that opens the block + a + -- | “Stanzas” of nested tokens + [[BlockTree a]] + -- | The closing token, if any + (Maybe a) | Leaf a deriving (Functor, Foldable, Traversable) @@ -1534,22 +1541,22 @@ instance (Show a) => Show (BlockTree a) where show (Block open mid close) = show open ++ "\n" - ++ indent " " (intercalateMap "\n" show mid) + ++ indent " " (intercalateMap "\n" (intercalateMap " " show) mid) ++ "\n" - ++ intercalateMap "" show close + ++ maybe "" show close where indent by s = by ++ (s >>= go by) go by '\n' = '\n' : by go _ c = [c] -reorderTree :: ([BlockTree a] -> [BlockTree a]) -> BlockTree a -> BlockTree a -reorderTree f (Block open mid close) = Block open (f (reorderTree f <$> mid)) close +reorderTree :: ([[BlockTree a]] -> [[BlockTree a]]) -> BlockTree a -> BlockTree a +reorderTree f (Block open mid close) = Block open (f (fmap (reorderTree f) <$> mid)) close reorderTree _ l = l tree :: [Token Lexeme] -> BlockTree (Token Lexeme) tree toks = one toks const where - one (open@(payload -> Open _) : ts) k = many (Block open) [] ts k + one (open@(payload -> Open _) : ts) k = many (Block open . stanzas) [] ts k one (t : ts) k = k (Leaf t) ts one [] k = k lastErr [] where @@ -1557,22 +1564,24 @@ tree toks = one toks const [] -> Token (Err LayoutError) topLeftCorner topLeftCorner (t : _) -> t {payload = Err LayoutError} - many open acc [] k = k (open (reverse acc) []) [] - many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) [t]) ts + many open acc [] k = k (open (reverse acc) Nothing) [] + many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) $ pure t) ts many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] -stanzas = go [] - where - go acc [] = [reverse acc] - go acc (t : ts) = case payload $ headToken t of - Semi _ -> reverse (t : acc) : go [] ts - _ -> go (t : acc) ts +stanzas = + toList + . foldr + ( \tok (curr :| stanzas) -> case tok of + Leaf (Token (Semi _) _ _) -> [tok] :| curr : stanzas + _ -> (tok : curr) :| stanzas + ) + ([] :| []) -- Moves type and ability declarations to the front of the token stream -- and move `use` statements to the front of each block -reorder :: [BlockTree (Token Lexeme)] -> [BlockTree (Token Lexeme)] -reorder = foldr fixup [] . join . sortWith f . stanzas +reorder :: [[BlockTree (Token Lexeme)]] -> [[BlockTree (Token Lexeme)]] +reorder = foldr fixup [] . sortWith f where f [] = 3 :: Int f (t0 : _) = case payload $ headToken t0 of @@ -1582,8 +1591,13 @@ reorder = foldr fixup [] . join . sortWith f . stanzas _ -> 3 :: Int -- after reordering can end up with trailing semicolon at the end of -- a block, which we remove with this pass - fixup (payload . headToken -> Semi _) [] = [] - fixup tok tail = tok : tail + fixup stanza [] = case Lens.unsnoc stanza of + Nothing -> [] + -- remove any trailing `Semi` from the last non-empty stanza + Just (init, Leaf (Token (Semi _) _ _)) -> [init] + -- don’t touch other stanzas + Just (_, _) -> [stanza] + fixup stanza tail = stanza : tail -- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. preParse :: [Token Lexeme] -> BlockTree (Token Lexeme) From a6f6d9c8dc35adabb6052eccb5e341b80083d317 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 22:59:18 -0600 Subject: [PATCH 23/50] Remove unnecessary `docOpen` in Doc parser --- unison-syntax/src/Unison/Syntax/Lexer.hs | 2 -- 1 file changed, 2 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index a356757bf7..9f2119011a 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -653,7 +653,6 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) let end = P.lookAhead $ void docClose - <|> void docOpen <|> void (P.satisfy isSpace) <|> void closing word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end @@ -768,7 +767,6 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) pure ex docClose = [] <$ docClose' - docOpen = [] <$ lit "{{" link = P.label "link (examples: {type List}, {Nat.+})" $ From c53cb088e1262a2f06a81fc1dc66f685d86cc707 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 10:28:33 -0600 Subject: [PATCH 24/50] Split `Doc` into its own module --- .../src/Unison/Syntax/TermParser.hs | 79 +++--- unison-syntax/src/Unison/Syntax/Lexer.hs | 232 ++++-------------- .../src/Unison/Syntax/Lexer/Token.hs | 4 + unison-syntax/src/Unison/Syntax/Parser.hs | 5 +- .../src/Unison/Syntax/Parser/Doc/Data.hs | 166 +++++++++++++ unison-syntax/unison-syntax.cabal | 1 + 6 files changed, 260 insertions(+), 227 deletions(-) create mode 100644 unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 999d5658ba..89d5504079 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -25,7 +25,7 @@ import Data.Sequence qualified as Sequence import Data.Set qualified as Set import Data.Text qualified as Text import Data.Tuple.Extra qualified as TupleE -import Data.Void (vacuous) +import Data.Void (absurd, vacuous) import Text.Megaparsec qualified as P import U.Core.ABT qualified as ABT import Unison.ABT qualified as ABT @@ -52,6 +52,7 @@ import Unison.Syntax.Name qualified as Name (toText, toVar, unsafeParseVar) import Unison.Syntax.NameSegment qualified as NameSegment import Unison.Syntax.Parser hiding (seq) import Unison.Syntax.Parser qualified as Parser (seq, uniqueName) +import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Syntax.TypeParser qualified as TypeParser import Unison.Term (IsTop, Term) import Unison.Term qualified as Term @@ -525,43 +526,43 @@ doc2Block = do f :: (Annotated a) => a -> String -> Term v Ann f a = Term.var (gann a) . Var.nameds . ("syntax.doc" <>) - docUntitledSection :: Ann -> L.DocUntitledSection (Term v Ann) -> Term v Ann - docUntitledSection ann (L.DocUntitledSection tops) = + docUntitledSection :: Ann -> Doc.UntitledSection (Term v Ann) -> Term v Ann + docUntitledSection ann (Doc.UntitledSection tops) = Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops - docTop :: L.DocTop (Term v Ann) -> TermP v m + docTop :: Doc.Top [L.Token L.Lexeme] (Term v Ann) -> TermP v m docTop d = case d of - L.DocSection title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] - L.DocEval code -> + Doc.Section title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] + Doc.Eval code -> Term.app (gann d) (f d "Eval") . addDelay . snd <$> subParse (block' False False "syntax.docEval" (pure $ pure ()) $ Ann.External <$ P.eof) code - L.DocExampleBlock code -> + Doc.ExampleBlock code -> Term.apps' (f d "ExampleBlock") . (Term.nat (gann d) 0 :) . pure . addDelay . snd <$> subParse (block' False True "syntax.docExampleBlock" (pure $ pure ()) $ Ann.External <$ P.eof) code - L.DocCodeBlock label body -> + Doc.CodeBlock label body -> pure $ Term.apps' (f d "CodeBlock") [Term.text (ann label) . Text.pack $ L.payload label, Term.text (ann body) . Text.pack $ L.payload body] - L.DocBulletedList items -> + Doc.BulletedList items -> pure $ Term.app (gann d) (f d "BulletedList") . Term.list (gann items) . toList $ docColumn <$> items - L.DocNumberedList items@((n, _) :| _) -> + Doc.NumberedList items@((n, _) :| _) -> pure $ Term.apps' (f d "NumberedList") [Term.nat (ann d) $ L.payload n, Term.list (gann $ snd <$> items) . toList $ docColumn . snd <$> items] - L.DocParagraph leaves -> + Doc.Paragraph leaves -> Term.app (gann d) (f d "Paragraph") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docColumn :: L.DocColumn (Term v Ann) -> Term v Ann - docColumn d@(L.DocColumn para sublist) = + docColumn :: Doc.Column (Term v Ann) -> Term v Ann + docColumn d@(Doc.Column para sublist) = Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist - docLeaf :: L.DocLeaf (Term v Ann) -> TermP v m + docLeaf :: Doc.Leaf [L.Token L.Lexeme] (Term v Ann) -> TermP v m docLeaf d = case d of - L.DocLink link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link - L.DocNamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) - L.DocExample code -> do + Doc.Link link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link + Doc.NamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) + Doc.Example code -> do trm <- subParse term code pure . Term.apps' (f d "Example") $ case trm of tm@(Term.Apps' _ xs) -> @@ -570,45 +571,45 @@ doc2Block = do lam = addDelay $ Term.lam' (ann tm) ((mempty,) <$> fvs) tm in [n, lam] tm -> [Term.nat (ann tm) 0, addDelay tm] - L.DocTransclude code -> Term.app (gann d) (f d "Transclude") <$> subParse term code - L.DocBold para -> pure $ Term.app (gann d) (f d "Bold") para - L.DocItalic para -> pure $ Term.app (gann d) (f d "Italic") para - L.DocStrikethrough para -> pure $ Term.app (gann d) (f d "Strikethrough") para - L.DocVerbatim leaf -> Term.app (gann d) (f d "Verbatim") <$> docLeaf (vacuous leaf) - L.DocCode leaf -> Term.app (gann d) (f d "Code") <$> docLeaf (vacuous leaf) - L.DocSource elems -> + Doc.Transclude code -> Term.app (gann d) (f d "Transclude") <$> subParse term code + Doc.Bold para -> pure $ Term.app (gann d) (f d "Bold") para + Doc.Italic para -> pure $ Term.app (gann d) (f d "Italic") para + Doc.Strikethrough para -> pure $ Term.app (gann d) (f d "Strikethrough") para + Doc.Verbatim leaf -> Term.app (gann d) (f d "Verbatim") <$> docLeaf (bimap absurd absurd leaf) + Doc.Code leaf -> Term.app (gann d) (f d "Code") <$> docLeaf (bimap absurd absurd leaf) + Doc.Source elems -> Term.app (gann d) (f d "Source") . Term.list (ann elems) . toList <$> traverse docSourceElement elems - L.DocFoldedSource elems -> + Doc.FoldedSource elems -> Term.app (gann d) (f d "FoldedSource") . Term.list (ann elems) . toList <$> traverse docSourceElement elems - L.DocEvalInline code -> Term.app (gann d) (f d "EvalInline") . addDelay <$> subParse term code - L.DocSignature links -> + Doc.EvalInline code -> Term.app (gann d) (f d "EvalInline") . addDelay <$> subParse term code + Doc.Signature links -> Term.app (gann d) (f d "Signature") . Term.list (ann links) . toList <$> traverse docEmbedSignatureLink links - L.DocSignatureInline link -> Term.app (gann d) (f d "SignatureInline") <$> docEmbedSignatureLink link - L.DocWord txt -> pure . Term.app (gann d) (f d "Word") . Term.text (ann txt) . Text.pack $ L.payload txt - L.DocGroup (L.DocJoin leaves) -> + Doc.SignatureInline link -> Term.app (gann d) (f d "SignatureInline") <$> docEmbedSignatureLink link + Doc.Word txt -> pure . Term.app (gann d) (f d "Word") . Term.text (ann txt) . Text.pack $ L.payload txt + Doc.Group (Doc.Join leaves) -> Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docEmbedLink :: L.DocEmbedLink -> TermP v m + docEmbedLink :: Doc.EmbedLink -> TermP v m docEmbedLink d = case d of - L.DocEmbedTypeLink ident -> + Doc.EmbedTypeLink ident -> Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload <$> findUniqueType (HQ'.toHQ <$> ident) - L.DocEmbedTermLink ident -> + Doc.EmbedTermLink ident -> Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docSourceElement :: L.DocSourceElement -> TermP v m - docSourceElement d@(L.DocSourceElement link anns) = do + docSourceElement :: Doc.SourceElement [L.Token L.Lexeme] -> TermP v m + docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] - docEmbedSignatureLink :: L.DocEmbedSignatureLink -> TermP v m - docEmbedSignatureLink d@(L.DocEmbedSignatureLink ident) = + docEmbedSignatureLink :: Doc.EmbedSignatureLink -> TermP v m + docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docEmbedAnnotation :: L.DocEmbedAnnotation -> TermP v m - docEmbedAnnotation d@(L.DocEmbedAnnotation a) = + docEmbedAnnotation :: Doc.EmbedAnnotation [L.Token L.Lexeme] -> TermP v m + docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes -- produced for an identifier and the lexemes consumed for text line up. So, I think this is a bugfix I can’t diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 9f2119011a..c0d1c3c04c 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -9,15 +9,6 @@ module Unison.Syntax.Lexer Pos (..), Lexeme (..), DocTree, - DocUntitledSection (..), - DocTop (..), - DocColumn (..), - DocLeaf (..), - DocEmbedLink (..), - DocSourceElement (..), - DocEmbedSignatureLink (..), - DocJoin (..), - DocEmbedAnnotation (..), lexer, preParse, escapeChars, @@ -43,7 +34,6 @@ import Control.Lens qualified as Lens import Control.Monad.State qualified as S import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) import Data.Foldable qualified as Foldable -import Data.Functor.Classes import Data.List qualified as List import Data.List.Extra qualified as List import Data.List.NonEmpty (NonEmpty ((:|))) @@ -66,7 +56,7 @@ import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment -import Unison.Parser.Ann (Ann (Ann), Annotated (..)) +import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as SH @@ -75,6 +65,7 @@ import Unison.Syntax.Lexer.Token (Token (..), posP, tokenP) import Unison.Syntax.Name qualified as Name (isSymboly, nameP, toText, unsafeParseText) import Unison.Syntax.NameSegment (symbolyIdChar, wordyIdChar, wordyIdStartChar) import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) +import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) import Unison.Util.Bytes qualified as Bytes @@ -83,9 +74,6 @@ import Unison.Util.Monoid (intercalateMap) instance (Annotated a) => Annotated (Cofree f a) where ann (a :< _) = ann a -instance Annotated (Token a) where - ann (Token _ s e) = Ann s e - type BlockName = String type Layout = [(BlockName, Column)] @@ -158,7 +146,7 @@ data Lexeme | Bytes Bytes.Bytes -- bytes literals | Hash ShortHash -- hash literals | Err Err - | Doc (DocUntitledSection DocTree) + | Doc (Doc.UntitledSection DocTree) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? @@ -422,6 +410,8 @@ restoreStack lbl p = do S.put (s2 {layout = layout1}) pure $ p <> closes +type DocTree = Cofree (Doc.Top [Token Lexeme]) Ann + -- | The `Doc` lexer as documented on unison-lang.org doc2 :: P [Token Lexeme] doc2 = do @@ -508,148 +498,18 @@ someTill' p end = liftA2 (:|) p $ P.manyTill p end sepBy1' :: P a -> P sep -> P (NonEmpty a) sepBy1' p sep = liftA2 (:|) p . many $ sep *> p -newtype DocUntitledSection a = DocUntitledSection [a] - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - --- | Haskell parallel to @unison/base.Doc@. --- --- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The --- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls, --- have it rendered to a scratch file, and then we can’t parse it. Changing the types here to match Unison wouldn’t --- fix the issue. We have to modify the types and parser in concert (in both Haskell and Unison) to bring them in --- line. --- --- __NB__: Uses of @[`Token` `Lexeme`]@ here indicate a nested transition to the Unison lexer. -data DocTop a - = -- | The first argument is always a Paragraph - DocSection a [a] - | DocEval [Token Lexeme] - | DocExampleBlock [Token Lexeme] - | DocCodeBlock (Token String) (Token String) - | DocBulletedList (NonEmpty (DocColumn a)) - | DocNumberedList (NonEmpty (Token Word64, DocColumn a)) - | DocParagraph (NonEmpty (DocLeaf a)) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -instance Eq1 DocTop where - liftEq _ _ _ = True - -instance Ord1 DocTop where - liftCompare _ _ _ = LT - -instance Show1 DocTop where - liftShowsPrec _ _ _ _ x = x - -data DocColumn a - = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List - DocColumn a (Maybe a) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -data DocLeaf a - = DocLink DocEmbedLink - | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of - -- Transcludes & Words) - DocNamedLink a (DocLeaf Void) - | DocExample [Token Lexeme] - | DocTransclude [Token Lexeme] - | -- | Always a Paragraph - DocBold a - | -- | Always a Paragraph - DocItalic a - | -- | Always a Paragraph - DocStrikethrough a - | -- | Always a Word - DocVerbatim (DocLeaf Void) - | -- | Always a Word - DocCode (DocLeaf Void) - | DocSource (NonEmpty DocSourceElement) - | DocFoldedSource (NonEmpty DocSourceElement) - | DocEvalInline [Token Lexeme] - | DocSignature (NonEmpty DocEmbedSignatureLink) - | DocSignatureInline DocEmbedSignatureLink - | DocWord (Token String) - | DocGroup (DocJoin a) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -data DocEmbedLink - = DocEmbedTypeLink (Token (HQ'.HashQualified Name)) - | DocEmbedTermLink (Token (HQ'.HashQualified Name)) - deriving (Eq, Ord, Show) - -data DocSourceElement = DocSourceElement DocEmbedLink [DocEmbedAnnotation] - deriving (Eq, Ord, Show) - -newtype DocEmbedSignatureLink = DocEmbedSignatureLink (Token (HQ'.HashQualified Name)) - deriving (Eq, Ord, Show) - -newtype DocJoin a = DocJoin (NonEmpty (DocLeaf a)) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -newtype DocEmbedAnnotation - = -- | Always a DocTransclude - DocEmbedAnnotation (Either (Token (HQ'.HashQualified Name)) (DocLeaf Void)) - deriving (Eq, Ord, Show) - -type DocTree = Cofree DocTop Ann - -instance (Annotated a) => Annotated (DocTop a) where - ann = \case - DocSection title body -> ann title <> ann body - DocEval code -> ann code - DocExampleBlock code -> ann code - DocCodeBlock label body -> ann label <> ann body - DocBulletedList items -> ann items - DocNumberedList items -> ann $ snd <$> items - DocParagraph leaves -> ann leaves - -instance (Annotated a) => Annotated (DocColumn a) where - ann (DocColumn para list) = ann para <> ann list - -instance (Annotated a) => Annotated (DocLeaf a) where - ann = \case - DocLink link -> ann link - DocNamedLink label target -> ann label <> ann target - DocExample code -> ann code - DocTransclude code -> ann code - DocBold para -> ann para - DocItalic para -> ann para - DocStrikethrough para -> ann para - DocVerbatim word -> ann word - DocCode word -> ann word - DocSource elems -> ann elems - DocFoldedSource elems -> ann elems - DocEvalInline code -> ann code - DocSignature links -> ann links - DocSignatureInline link -> ann link - DocWord text -> ann text - DocGroup (DocJoin leaves) -> ann leaves - -instance Annotated DocEmbedLink where - ann = \case - DocEmbedTypeLink name -> ann name - DocEmbedTermLink name -> ann name - -instance Annotated DocSourceElement where - ann (DocSourceElement link target) = ann link <> ann target - -instance Annotated DocEmbedSignatureLink where - ann (DocEmbedSignatureLink name) = ann name - -instance Annotated DocEmbedAnnotation where - ann (DocEmbedAnnotation a) = either ann ann a - -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -docBody :: P end -> P (DocUntitledSection DocTree) -docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) +docBody :: P end -> P (Doc.UntitledSection DocTree) +docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) where wordyKw kw = separated wordySep (lit kw) sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap' . DocParagraph <$> spaced leaf + paragraph = wrap' . Doc.Paragraph <$> spaced leaf reserved word = List.isPrefixOf "}}" word || all (== '#') word - wordy :: P end -> P (DocLeaf void) - wordy closing = fmap DocWord . tokenP . P.try $ do + wordy :: P end -> P (Doc.Leaf [Token Lexeme] void) + wordy closing = fmap Doc.Word . tokenP . P.try $ do let end = P.lookAhead $ void docClose @@ -677,10 +537,10 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where comma = lit "," <* CP.space src = - src' DocSource "@source" - <|> src' DocFoldedSource "@foldedSource" + src' Doc.Source "@source" + <|> src' Doc.FoldedSource "@foldedSource" srcElem = - DocSourceElement + Doc.SourceElement <$> (typeLink <|> termLink) <*> ( fmap (fromMaybe []) . P.optional $ (lit "@") *> (CP.space *> annotations) @@ -688,38 +548,38 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space annotations = - P.some (DocEmbedAnnotation <$> annotation) + P.some (Doc.EmbedAnnotation <$> annotation) src' name atName = fmap name $ do _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space s <- sepBy1' srcElem comma _ <- lit "}" pure s - signature = fmap DocSignature $ do + signature = fmap Doc.Signature $ do _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space s <- sepBy1' signatureLink comma _ <- lit "}" pure s - signatureInline = fmap DocSignatureInline $ do + signatureInline = fmap Doc.SignatureInline $ do _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space s <- signatureLink _ <- lit "}" pure s - evalInline = fmap DocEvalInline $ do + evalInline = fmap Doc.EvalInline $ do _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space let inlineEvalClose = [] <$ lit "}" s <- lexemes' inlineEvalClose pure s - typeLink = fmap DocEmbedTypeLink $ do + typeLink = fmap Doc.EmbedTypeLink $ do _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space tokenP identifierP <* CP.space termLink = - fmap DocEmbedTermLink $ + fmap Doc.EmbedTermLink $ tokenP identifierP <* CP.space signatureLink = - fmap DocEmbedSignatureLink $ + fmap Doc.EmbedSignatureLink $ tokenP identifierP <* CP.space groupy closing p = do @@ -728,8 +588,8 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) pure $ case after of Nothing -> p Just after -> - DocGroup - . DocJoin + Doc.Group + . Doc.Join $ p :| pure after @@ -748,17 +608,17 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed -- If it's a multi-line verbatim block we trim any whitespace representing -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . DocVerbatim $ - DocWord $ + pure . Doc.Verbatim $ + Doc.Word $ Token txt start stop else - pure . DocCode $ - DocWord $ + pure . Doc.Code $ + Doc.Word $ Token originalText start stop exampleInline = P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - fmap DocExample $ do + fmap Doc.Example $ do n <- P.try $ do _ <- lit "`" length <$> P.takeWhile1P (Just "backticks") (== '`') @@ -770,12 +630,12 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) link = P.label "link (examples: {type List}, {Nat.+})" $ - fmap DocLink $ + fmap Doc.Link $ P.try $ lit "{" *> (typeLink <|> termLink) <* lit "}" expr = - fmap DocTransclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + fmap Doc.Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ openAs "{{" "syntax.docTransclude" *> do env0 <- S.get @@ -806,7 +666,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) P.label "block eval (syntax: a fenced code block)" $ evalUnison <|> exampleBlock <|> other where - evalUnison = fmap (wrap' . DocEval) $ do + evalUnison = fmap (wrap' . Doc.Eval) $ do -- commit after seeing that ``` is on its own line fence <- P.try $ do fence <- lit "```" <+> P.takeWhileP Nothing (== '`') @@ -817,7 +677,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) (\env -> env {inLayout = True, opening = Just "docEval"}) (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) - exampleBlock = fmap (wrap' . DocExampleBlock) $ do + exampleBlock = fmap (wrap' . Doc.ExampleBlock) $ do void $ lit "@typecheck" <* CP.space fence <- lit "```" <+> P.takeWhileP Nothing (== '`') local @@ -834,7 +694,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) skip _ s = s in List.intercalate "\n" $ skip column <$> lines s - other = fmap (uncurry $ wrapSimple2 DocCodeBlock) $ do + other = fmap (uncurry $ wrapSimple2 Doc.CodeBlock) $ do column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel let tabWidth = toInteger . P.unPos $ P.defaultTabWidth fence <- lit "```" <+> P.takeWhileP Nothing (== '`') @@ -857,26 +717,26 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) (P.satisfy (== '~')) name s = if take 1 s == "~" - then DocStrikethrough - else if take 1 s == "*" then DocBold else DocItalic + then Doc.Strikethrough + else if take 1 s == "*" then Doc.Bold else Doc.Italic end <- P.try $ do end <- start P.lookAhead (P.satisfy (not . isSpace)) pure end - name end . wrap' . DocParagraph + name end . wrap' . Doc.Paragraph <$> someTill' (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) (lit end) externalLink = P.label "hyperlink (example: [link name](https://destination.com))" $ - fmap (uncurry DocNamedLink) $ do + fmap (uncurry Doc.NamedLink) $ do _ <- lit "[" p <- leafies (void $ char ']') _ <- lit "]" _ <- lit "(" target <- - fmap (DocGroup . DocJoin) $ + fmap (Doc.Group . Doc.Join) $ fmap pure link <|> some' (expr <|> wordy (char ')')) _ <- lit ")" pure (p, target) @@ -894,12 +754,12 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) ok s = length [() | '\n' <- s] < 2 spaced p = some' (p <* P.optional sp) - leafies close = wrap' . DocParagraph <$> spaced (leafy close) + leafies close = wrap' . Doc.Paragraph <$> spaced (leafy close) list = bulletedList <|> numberedList - bulletedList = wrap' . DocBulletedList <$> sepBy1' bullet listSep - numberedList = wrap' . DocNumberedList <$> sepBy1' numberedItem listSep + bulletedList = wrap' . Doc.BulletedList <$> sepBy1' bullet listSep + numberedList = wrap' . Doc.NumberedList <$> sepBy1' numberedItem listSep listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) @@ -921,7 +781,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) numberedStart = listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - listItemParagraph = fmap (wrap' . DocParagraph) $ do + listItemParagraph = fmap (wrap' . Doc.Paragraph) $ do col <- column <$> posP some' (leaf <* sep col) where @@ -947,7 +807,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) numberedItem = P.label msg $ do (col, s) <- numberedStart (s,) - <$> ( fmap (uncurry DocColumn) $ do + <$> ( fmap (uncurry Doc.Column) $ do p <- nonNewlineSpaces *> listItemParagraph subList <- local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) @@ -956,7 +816,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" - bullet = fmap (uncurry DocColumn) . P.label "bullet (examples: * item1, - item2)" $ do + bullet = fmap (uncurry Doc.Column) . P.label "bullet (examples: * item1, - item2)" $ do (col, _) <- bulletedStart p <- nonNewlineSpaces *> listItemParagraph subList <- @@ -978,7 +838,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) -- # A section title (not a subsection) section :: P DocTree - section = fmap (wrap' . uncurry DocSection) $ do + section = fmap (wrap' . uncurry Doc.Section) $ do ns <- S.gets parentSections hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp title <- paragraph <* CP.space @@ -988,10 +848,10 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) P.many (sectionElem <* CP.space) pure $ (title, body) - wrap' :: DocTop DocTree -> DocTree + wrap' :: Doc.Top [Token Lexeme] DocTree -> DocTree wrap' doc = ann doc :< doc - wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> DocTop DocTree) -> a -> b -> DocTree + wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Doc.Top [Token Lexeme] DocTree) -> a -> b -> DocTree wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs index 81842c409e..e29f276c5e 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs @@ -9,6 +9,7 @@ import Data.Text qualified as Text import Text.Megaparsec (ParsecT, TraversableStream) import Text.Megaparsec qualified as P import Unison.Lexer.Pos (Pos (Pos)) +import Unison.Parser.Ann (Ann (Ann), Annotated (..)) import Unison.Prelude data Token a = Token @@ -18,6 +19,9 @@ data Token a = Token } deriving stock (Eq, Ord, Show, Functor, Foldable, Traversable) +instance Annotated (Token a) where + ann (Token _ s e) = Ann s e + instance Applicative Token where pure a = Token a (Pos 0 0) (Pos 0 0) Token f start _ <*> Token a _ end = Token (f a) start end diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 344de0fd1b..1bee4d08f4 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -90,6 +90,7 @@ import Unison.Reference (Reference) import Unison.Referent (Referent) import Unison.Syntax.Lexer qualified as L import Unison.Syntax.Name qualified as Name (toVar, unsafeParseText) +import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Term (MatchCase (..)) import Unison.UnisonFile.Error qualified as UF import Unison.Util.Bytes (Bytes) @@ -270,7 +271,7 @@ closeBlock :: (Ord v) => P v m (L.Token ()) closeBlock = void <$> matchToken L.Close -- | With layout, blocks might “close” without an explicit outdent (e.g., not even a newline at the end of a --- `DocTransclude`). This allows those blocks to be closed by EOF. +-- `Doc.Transclude`). This allows those blocks to be closed by EOF. optionalCloseBlock :: (Ord v) => P v m (L.Token ()) optionalCloseBlock = closeBlock <|> (\() -> L.Token () mempty mempty) <$> P.eof @@ -399,7 +400,7 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing -doc :: (Ord v) => P v m (L.Token (L.DocUntitledSection L.DocTree)) +doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection L.DocTree)) doc = queryToken \case L.Doc d -> pure d _ -> Nothing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs new file mode 100644 index 0000000000..4a88200b8b --- /dev/null +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs @@ -0,0 +1,166 @@ +-- | Haskell parallel to @unison/base.Doc@. +-- +-- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The +-- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls, +-- have it rendered to a scratch file, and then we can’t parse it. Changing the types here to match Unison wouldn’t +-- fix the issue. We have to modify the types and parser in concert (in both Haskell and Unison) to bring them in +-- line. +module Unison.Syntax.Parser.Doc.Data where + +import Data.Functor.Classes +import Data.List.NonEmpty (NonEmpty) +import Unison.HashQualifiedPrime qualified as HQ' +import Unison.Name (Name) +import Unison.Parser.Ann (Annotated (..)) +import Unison.Prelude +import Unison.Syntax.Lexer.Token (Token (..)) + +newtype UntitledSection a = UntitledSection [a] + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data Top code a + = -- | The first argument is always a Paragraph + Section a [a] + | Eval code + | ExampleBlock code + | CodeBlock (Token String) (Token String) + | BulletedList (NonEmpty (Column a)) + | NumberedList (NonEmpty (Token Word64, Column a)) + | Paragraph (NonEmpty (Leaf code a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +instance Eq2 Top where + liftEq2 _ _ _ _ = True + +instance (Eq code) => Eq1 (Top code) + +instance Ord2 Top where + liftCompare2 _ _ _ _ = LT + +instance (Ord code) => Ord1 (Top code) + +instance Show2 Top where + liftShowsPrec2 _ _ _ _ _ _ x = x + +instance (Show code) => Show1 (Top code) + +data Column a + = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List + Column a (Maybe a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data Leaf code a + = Link EmbedLink + | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of + -- Transcludes & Words) + NamedLink a (Leaf code Void) + | Example code + | Transclude code + | -- | Always a Paragraph + Bold a + | -- | Always a Paragraph + Italic a + | -- | Always a Paragraph + Strikethrough a + | -- | Always a Word + Verbatim (Leaf Void Void) + | -- | Always a Word + Code (Leaf Void Void) + | Source (NonEmpty (SourceElement code)) + | FoldedSource (NonEmpty (SourceElement code)) + | EvalInline code + | Signature (NonEmpty EmbedSignatureLink) + | SignatureInline EmbedSignatureLink + | Word (Token String) + | Group (Join code a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +instance Bifunctor Leaf where + bimap f g = \case + Link x -> Link x + NamedLink a leaf -> NamedLink (g a) $ first f leaf + Example code -> Example $ f code + Transclude code -> Transclude $ f code + Bold a -> Bold $ g a + Italic a -> Italic $ g a + Strikethrough a -> Strikethrough $ g a + Verbatim leaf -> Verbatim leaf + Code leaf -> Code leaf + Source elems -> Source $ fmap f <$> elems + FoldedSource elems -> FoldedSource $ fmap f <$> elems + EvalInline code -> EvalInline $ f code + Signature x -> Signature x + SignatureInline x -> SignatureInline x + Word x -> Word x + Group join -> Group $ bimap f g join + +data EmbedLink + = EmbedTypeLink (Token (HQ'.HashQualified Name)) + | EmbedTermLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +data SourceElement code = SourceElement EmbedLink [EmbedAnnotation code] + deriving (Eq, Ord, Show, Functor) + +newtype EmbedSignatureLink = EmbedSignatureLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +newtype Join code a = Join (NonEmpty (Leaf code a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +instance Bifunctor Join where + bimap f g (Join leaves) = Join $ bimap f g <$> leaves + +newtype EmbedAnnotation code + = -- | Always a Transclude + EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) (Leaf code Void)) + deriving (Eq, Ord, Show) + +instance Functor EmbedAnnotation where + fmap f (EmbedAnnotation ann) = EmbedAnnotation $ first f <$> ann + +instance (Annotated code, Annotated a) => Annotated (Top code a) where + ann = \case + Section title body -> ann title <> ann body + Eval code -> ann code + ExampleBlock code -> ann code + CodeBlock label body -> ann label <> ann body + BulletedList items -> ann items + NumberedList items -> ann $ snd <$> items + Paragraph leaves -> ann leaves + +instance (Annotated a) => Annotated (Column a) where + ann (Column para list) = ann para <> ann list + +instance (Annotated code, Annotated a) => Annotated (Leaf code a) where + ann = \case + Link link -> ann link + NamedLink label target -> ann label <> ann target + Example code -> ann code + Transclude code -> ann code + Bold para -> ann para + Italic para -> ann para + Strikethrough para -> ann para + Verbatim word -> ann word + Code word -> ann word + Source elems -> ann elems + FoldedSource elems -> ann elems + EvalInline code -> ann code + Signature links -> ann links + SignatureInline link -> ann link + Word text -> ann text + Group (Join leaves) -> ann leaves + +instance Annotated EmbedLink where + ann = \case + EmbedTypeLink name -> ann name + EmbedTermLink name -> ann name + +instance (Annotated code) => Annotated (SourceElement code) where + ann (SourceElement link target) = ann link <> ann target + +instance Annotated EmbedSignatureLink where + ann (EmbedSignatureLink name) = ann name + +instance (Annotated code) => Annotated (EmbedAnnotation code) where + ann (EmbedAnnotation a) = either ann ann a diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 4b097e6021..31ee026b7c 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -26,6 +26,7 @@ library Unison.Syntax.Name Unison.Syntax.NameSegment Unison.Syntax.Parser + Unison.Syntax.Parser.Doc.Data Unison.Syntax.ReservedWords Unison.Syntax.ShortHash Unison.Syntax.Var From 70fe615570ed57026fb58b8939a643d9155e58b0 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 14:33:40 -0600 Subject: [PATCH 25/50] Add `Data.Functor.Classes` instances These are needed for the new Doc types, but had been stubbed out. Moving the Doc types to their own module forced the changes that got in the way of generating these with Template Haskell. --- .../src/Unison/Syntax/TermParser.hs | 4 +- unison-syntax/package.yaml | 1 + .../src/Unison/Syntax/Parser/Doc/Data.hs | 90 +++++++++++-------- unison-syntax/unison-syntax.cabal | 2 + 4 files changed, 57 insertions(+), 40 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 89d5504079..6433bf220c 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -598,7 +598,7 @@ doc2Block = do Doc.EmbedTermLink ident -> Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docSourceElement :: Doc.SourceElement [L.Token L.Lexeme] -> TermP v m + docSourceElement :: Doc.SourceElement (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns @@ -608,7 +608,7 @@ doc2Block = do docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docEmbedAnnotation :: Doc.EmbedAnnotation [L.Token L.Lexeme] -> TermP v m + docEmbedAnnotation :: Doc.EmbedAnnotation (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes diff --git a/unison-syntax/package.yaml b/unison-syntax/package.yaml index ccb1a057d7..b093dc182f 100644 --- a/unison-syntax/package.yaml +++ b/unison-syntax/package.yaml @@ -9,6 +9,7 @@ dependencies: - bytes - containers - cryptonite + - deriving-compat - extra - free - lens diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs index 4a88200b8b..5167b2bcf6 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE TemplateHaskell #-} + -- | Haskell parallel to @unison/base.Doc@. -- -- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The @@ -7,8 +9,10 @@ -- line. module Unison.Syntax.Parser.Doc.Data where -import Data.Functor.Classes +import Data.Eq.Deriving (deriveEq1, deriveEq2) import Data.List.NonEmpty (NonEmpty) +import Data.Ord.Deriving (deriveOrd1, deriveOrd2) +import Text.Show.Deriving (deriveShow1, deriveShow2) import Unison.HashQualifiedPrime qualified as HQ' import Unison.Name (Name) import Unison.Parser.Ann (Annotated (..)) @@ -19,7 +23,7 @@ newtype UntitledSection a = UntitledSection [a] deriving (Eq, Ord, Show, Foldable, Functor, Traversable) data Top code a - = -- | The first argument is always a Paragraph + = -- | The first argument is always a `Paragraph` Section a [a] | Eval code | ExampleBlock code @@ -29,30 +33,15 @@ data Top code a | Paragraph (NonEmpty (Leaf code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -instance Eq2 Top where - liftEq2 _ _ _ _ = True - -instance (Eq code) => Eq1 (Top code) - -instance Ord2 Top where - liftCompare2 _ _ _ _ = LT - -instance (Ord code) => Ord1 (Top code) - -instance Show2 Top where - liftShowsPrec2 _ _ _ _ _ _ x = x - -instance (Show code) => Show1 (Top code) - data Column a - = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List + = -- | The first is always a `Paragraph`, and the second a `BulletedList` or `NumberedList` Column a (Maybe a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) data Leaf code a = Link EmbedLink | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of - -- Transcludes & Words) + -- `Transclude`s & `Word`s) NamedLink a (Leaf code Void) | Example code | Transclude code @@ -66,13 +55,15 @@ data Leaf code a Verbatim (Leaf Void Void) | -- | Always a Word Code (Leaf Void Void) - | Source (NonEmpty (SourceElement code)) - | FoldedSource (NonEmpty (SourceElement code)) + | -- | Always a Transclude + Source (NonEmpty (SourceElement (Leaf code Void))) + | -- | Always a Transclude + FoldedSource (NonEmpty (SourceElement (Leaf code Void))) | EvalInline code | Signature (NonEmpty EmbedSignatureLink) | SignatureInline EmbedSignatureLink | Word (Token String) - | Group (Join code a) + | Group (Join (Leaf code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) instance Bifunctor Leaf where @@ -86,38 +77,31 @@ instance Bifunctor Leaf where Strikethrough a -> Strikethrough $ g a Verbatim leaf -> Verbatim leaf Code leaf -> Code leaf - Source elems -> Source $ fmap f <$> elems - FoldedSource elems -> FoldedSource $ fmap f <$> elems + Source elems -> Source $ fmap (first f) <$> elems + FoldedSource elems -> FoldedSource $ fmap (first f) <$> elems EvalInline code -> EvalInline $ f code Signature x -> Signature x SignatureInline x -> SignatureInline x Word x -> Word x - Group join -> Group $ bimap f g join + Group join -> Group $ bimap f g <$> join data EmbedLink = EmbedTypeLink (Token (HQ'.HashQualified Name)) | EmbedTermLink (Token (HQ'.HashQualified Name)) deriving (Eq, Ord, Show) -data SourceElement code = SourceElement EmbedLink [EmbedAnnotation code] - deriving (Eq, Ord, Show, Functor) +data SourceElement a = SourceElement EmbedLink [EmbedAnnotation a] + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) newtype EmbedSignatureLink = EmbedSignatureLink (Token (HQ'.HashQualified Name)) deriving (Eq, Ord, Show) -newtype Join code a = Join (NonEmpty (Leaf code a)) +newtype Join a = Join (NonEmpty a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -instance Bifunctor Join where - bimap f g (Join leaves) = Join $ bimap f g <$> leaves - -newtype EmbedAnnotation code - = -- | Always a Transclude - EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) (Leaf code Void)) - deriving (Eq, Ord, Show) - -instance Functor EmbedAnnotation where - fmap f (EmbedAnnotation ann) = EmbedAnnotation $ first f <$> ann +newtype EmbedAnnotation a + = EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) instance (Annotated code, Annotated a) => Annotated (Top code a) where ann = \case @@ -164,3 +148,33 @@ instance Annotated EmbedSignatureLink where instance (Annotated code) => Annotated (EmbedAnnotation code) where ann (EmbedAnnotation a) = either ann ann a + +$(deriveEq1 ''Column) +$(deriveOrd1 ''Column) +$(deriveShow1 ''Column) + +$(deriveEq1 ''EmbedAnnotation) +$(deriveOrd1 ''EmbedAnnotation) +$(deriveShow1 ''EmbedAnnotation) + +$(deriveEq1 ''SourceElement) +$(deriveOrd1 ''SourceElement) +$(deriveShow1 ''SourceElement) + +$(deriveEq1 ''Join) +$(deriveOrd1 ''Join) +$(deriveShow1 ''Join) + +$(deriveEq1 ''Leaf) +$(deriveOrd1 ''Leaf) +$(deriveShow1 ''Leaf) +$(deriveEq2 ''Leaf) +$(deriveOrd2 ''Leaf) +$(deriveShow2 ''Leaf) + +$(deriveEq1 ''Top) +$(deriveOrd1 ''Top) +$(deriveShow1 ''Top) +$(deriveEq2 ''Top) +$(deriveOrd2 ''Top) +$(deriveShow2 ''Top) diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 31ee026b7c..853da4c817 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -69,6 +69,7 @@ library , bytes , containers , cryptonite + , deriving-compat , extra , free , lens @@ -127,6 +128,7 @@ test-suite syntax-tests , code-page , containers , cryptonite + , deriving-compat , easytest , extra , free From 31f952201c5a1448af0eb78b5a9dbff63f12c05f Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 15:59:51 -0600 Subject: [PATCH 26/50] Simplify `restoreStack` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It’s only used inside `local`, so its attempts to restore the layout are for naught. --- unison-syntax/src/Unison/Syntax/Lexer.hs | 62 ++++++------------------ 1 file changed, 15 insertions(+), 47 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index c0d1c3c04c..e2cba29dc4 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -257,7 +257,7 @@ token'' tok p = do topHasClosePair :: Layout -> Bool topHasClosePair [] = False topHasClosePair ((name, _) : _) = - name `elem` ["syntax.docTransclude", "{", "(", "[", "handle", "match", "if", "then"] + name `elem` ["DUMMY", "{", "(", "[", "handle", "match", "if", "then"] showErrorFancy :: (P.ShowErrorComponent e) => P.ErrorFancy e -> String showErrorFancy = \case @@ -394,22 +394,6 @@ infixl 2 <+> (<+>) :: (Monoid a) => P a -> P a -> P a p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) --- Runs the parser `p`, then: --- 1. resets the layout stack to be what it was before `p`. --- 2. emits enough closing tokens to reach `lbl` but not pop it. --- (you can think of this as just dealing with a final "unclosed" --- block at the end of `p`) -restoreStack :: String -> P [Token Lexeme] -> P [Token Lexeme] -restoreStack lbl p = do - layout1 <- S.gets layout - p <- p - s2 <- S.get - let (pos1, pos2) = foldl' (\_ b -> (start b, end b)) mempty p - unclosed = takeWhile (\(lbl', _) -> lbl' /= lbl) (layout s2) - closes = replicate (length unclosed) (Token Close pos1 pos2) - S.put (s2 {layout = layout1}) - pure $ p <> closes - type DocTree = Cofree (Doc.Top [Token Lexeme]) Ann -- | The `Doc` lexer as documented on unison-lang.org @@ -501,7 +485,7 @@ sepBy1' p sep = liftA2 (:|) p . many $ sep *> p -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). docBody :: P end -> P (Doc.UntitledSection DocTree) -docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) +docBody docClose = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) where wordyKw kw = separated wordySep (lit kw) sectionElem = section <|> fencedBlock <|> list <|> paragraph @@ -626,8 +610,6 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) ex <- CP.space *> lexemes' end pure ex - docClose = [] <$ docClose' - link = P.label "link (examples: {type List}, {Nat.+})" $ fmap Doc.Link $ @@ -636,20 +618,7 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) expr = fmap Doc.Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - openAs "{{" "syntax.docTransclude" - *> do - env0 <- S.get - -- we re-allow layout within a transclusion, then restore it to its - -- previous state after - S.put (env0 {inLayout = True}) - -- Note: this P.lookAhead ensures the }} isn't consumed, - -- so it can be consumed below by the `close` which will - -- pop items off the layout stack up to the nearest enclosing - -- syntax.docTransclude. - ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) - S.modify (\env -> env {inLayout = inLayout env0}) - pure ts - <* close ["syntax.docTransclude"] (lit "}}") + lit "{{" *> lexemes' ([] <$ lit "}}") nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace @@ -673,16 +642,12 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) fence <$ guard b CP.space - *> local - (\env -> env {inLayout = True, opening = Just "docEval"}) - (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) + *> lexemes' ([] <$ lit fence) exampleBlock = fmap (wrap' . Doc.ExampleBlock) $ do void $ lit "@typecheck" <* CP.space fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - local - (\env -> env {inLayout = True, opening = Just "docExampleBlock"}) - (restoreStack "docExampleBlock" $ lexemes' ([] <$ lit fence)) + lexemes' $ [] <$ lit fence uncolumn column tabWidth s = let skip col r | col < 1 = r @@ -855,10 +820,16 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] -lexemes' = +lexemes' eof = -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one, -- runs `postLex`, then removes it. - fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) . lexemes + fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) $ + local (\env -> env {inLayout = True, opening = Just "DUMMY"}) do + p <- lexemes eof + -- deals with a final "unclosed" block at the end of `p`) + unclosed <- takeWhile (("DUMMY" /=) . fst) . layout <$> S.get + let pos = end $ last p + pure $ p <> replicate (length unclosed) (Token Close pos pos) -- | Consumes an entire Unison “module”. lexemes :: P [Token Lexeme] -> P [Token Lexeme] @@ -1245,11 +1216,8 @@ separated :: (Char -> Bool) -> P a -> P a separated ok p = P.try $ p <* P.lookAhead (void (P.satisfy ok) <|> P.eof) open :: String -> P [Token Lexeme] -open b = openAs b b - -openAs :: String -> String -> P [Token Lexeme] -openAs syntax b = do - token <- tokenP $ lit syntax +open b = do + token <- tokenP $ lit b env <- S.get S.put (env {opening = Just b}) pure [Open b <$ token] From 6f2d188e5c8a0edae756046c7adaa0fbd9581407 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 14:35:26 -0600 Subject: [PATCH 27/50] Split Doc parser from Unison lexer --- parser-typechecker/src/Unison/PrintError.hs | 2 +- .../src/Unison/Syntax/TermParser.hs | 2 +- .../src/Unison/Syntax/TermPrinter.hs | 2 +- .../src/Unison/Codebase/Editor/HandleInput.hs | 5 +- unison-cli/src/Unison/LSP/FileAnalysis.hs | 2 +- unison-cli/src/Unison/LSP/Types.hs | 2 +- unison-syntax/src/Unison/Syntax/Lexer.hs | 1354 +---------------- .../src/Unison/Syntax/Lexer/Unison.hs | 910 +++++++++++ unison-syntax/src/Unison/Syntax/Parser.hs | 5 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 476 ++++++ unison-syntax/test/Main.hs | 2 +- unison-syntax/unison-syntax.cabal | 2 + 12 files changed, 1430 insertions(+), 1334 deletions(-) create mode 100644 unison-syntax/src/Unison/Syntax/Lexer/Unison.hs create mode 100644 unison-syntax/src/Unison/Syntax/Parser/Doc.hs diff --git a/parser-typechecker/src/Unison/PrintError.hs b/parser-typechecker/src/Unison/PrintError.hs index 8b73b179f1..dd796c0159 100644 --- a/parser-typechecker/src/Unison/PrintError.hs +++ b/parser-typechecker/src/Unison/PrintError.hs @@ -54,7 +54,7 @@ import Unison.Result qualified as Result import Unison.Settings qualified as Settings import Unison.Symbol (Symbol) import Unison.Syntax.HashQualified qualified as HQ (toText) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toText) import Unison.Syntax.NamePrinter (prettyHashQualified0) import Unison.Syntax.Parser (Annotated, ann) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 6433bf220c..4c3069b9ff 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -47,7 +47,7 @@ import Unison.Pattern qualified as Pattern import Unison.Prelude import Unison.Reference (Reference) import Unison.Referent (Referent) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toText, toVar, unsafeParseVar) import Unison.Syntax.NameSegment qualified as NameSegment import Unison.Syntax.Parser hiding (seq) diff --git a/parser-typechecker/src/Unison/Syntax/TermPrinter.hs b/parser-typechecker/src/Unison/Syntax/TermPrinter.hs index faeda76020..5c41701bf8 100644 --- a/parser-typechecker/src/Unison/Syntax/TermPrinter.hs +++ b/parser-typechecker/src/Unison/Syntax/TermPrinter.hs @@ -51,7 +51,7 @@ import Unison.Reference qualified as Reference import Unison.Referent (Referent) import Unison.Referent qualified as Referent import Unison.Syntax.HashQualified qualified as HQ (unsafeFromVar) -import Unison.Syntax.Lexer (showEscapeChar) +import Unison.Syntax.Lexer.Unison (showEscapeChar) import Unison.Syntax.Name qualified as Name (isSymboly, parseText, parseTextEither, toText, unsafeParseText) import Unison.Syntax.NamePrinter (styleHashQualified'') import Unison.Syntax.NameSegment qualified as NameSegment (toEscapedText) diff --git a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs index e85879cc4a..e17d3fdd9e 100644 --- a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs +++ b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs @@ -151,8 +151,7 @@ import Unison.ShortHash qualified as SH import Unison.Sqlite qualified as Sqlite import Unison.Symbol (Symbol) import Unison.Syntax.HashQualified qualified as HQ (parseTextWith, toText) -import Unison.Syntax.Lexer qualified as L -import Unison.Syntax.Lexer qualified as Lexer +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toText, toVar, unsafeParseVar) import Unison.Syntax.NameSegment qualified as NameSegment import Unison.Syntax.Parser qualified as Parser @@ -1137,7 +1136,7 @@ handleFindI isVerbose fscope ws input = do -- name query qs -> do - let anythingBeforeHash :: Megaparsec.Parsec (Lexer.Token Text) [Char] Text + let anythingBeforeHash :: Megaparsec.Parsec (L.Token Text) [Char] Text anythingBeforeHash = Text.pack <$> Megaparsec.takeWhileP Nothing (/= '#') let srs = searchBranchScored diff --git a/unison-cli/src/Unison/LSP/FileAnalysis.hs b/unison-cli/src/Unison/LSP/FileAnalysis.hs index 7a7ae006cf..bec9f8bf9f 100644 --- a/unison-cli/src/Unison/LSP/FileAnalysis.hs +++ b/unison-cli/src/Unison/LSP/FileAnalysis.hs @@ -57,7 +57,7 @@ import Unison.Result (Note) import Unison.Result qualified as Result import Unison.Symbol (Symbol) import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name import Unison.Syntax.Parser qualified as Parser import Unison.Syntax.TypePrinter qualified as TypePrinter diff --git a/unison-cli/src/Unison/LSP/Types.hs b/unison-cli/src/Unison/LSP/Types.hs index b368e915ef..268034ea5a 100644 --- a/unison-cli/src/Unison/LSP/Types.hs +++ b/unison-cli/src/Unison/LSP/Types.hs @@ -41,7 +41,7 @@ import Unison.Server.Backend qualified as Backend import Unison.Server.NameSearch (NameSearch) import Unison.Sqlite qualified as Sqlite import Unison.Symbol -import Unison.Syntax.Lexer qualified as Lexer +import Unison.Syntax.Lexer.Unison qualified as Lexer import Unison.Type (Type) import Unison.UnisonFile qualified as UF import Unison.UnisonFile.Summary (FileSummary (..)) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index e2cba29dc4..cfd932cd7e 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1,21 +1,13 @@ {-# LANGUAGE TemplateHaskell #-} {-# OPTIONS_GHC -Wno-orphans #-} +-- | This currently contains a mix of general lexing utilities and identifier-y lexers. module Unison.Syntax.Lexer ( Token (..), Line, Column, Err (..), Pos (..), - Lexeme (..), - DocTree, - lexer, - preParse, - escapeChars, - debugFilePreParse, - debugPreParse, - debugPreParse', - showEscapeChar, touches, -- * Character classifiers @@ -23,28 +15,40 @@ module Unison.Syntax.Lexer wordyIdStartChar, symbolyIdChar, - -- * Error formatting - formatTrivialError, - displayLexeme, + -- * new exports + BlockName, + Layout, + ParsingEnv (..), + P, + local, + parseFailure, + space, + lit, + err, + commitAfter2, + (<+>), + some', + someTill', + sepBy1', + separated, + wordySep, + identifierP, + wordyIdSegP, + shortHashP, + topBlockName, + pop, + typeOrAbilityAlt, + typeModifiersAlt, + inc, ) where import Control.Comonad.Cofree (Cofree ((:<))) -import Control.Lens qualified as Lens import Control.Monad.State qualified as S -import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) -import Data.Foldable qualified as Foldable -import Data.List qualified as List -import Data.List.Extra qualified as List +import Data.Char (isSpace) import Data.List.NonEmpty (NonEmpty ((:|))) -import Data.List.NonEmpty qualified as Nel -import Data.List.NonEmpty qualified as NonEmpty -import Data.Map.Strict qualified as Map -import Data.Set qualified as Set import Data.Text qualified as Text -import GHC.Exts (sortWith) import Text.Megaparsec qualified as P -import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Text.Megaparsec.Error qualified as EP @@ -52,24 +56,16 @@ import Text.Megaparsec.Internal qualified as PI import Unison.HashQualifiedPrime qualified as HQ' import Unison.Lexer.Pos (Column, Line, Pos (Pos), column, line) import Unison.Name (Name) -import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) -import Unison.NameSegment qualified as NameSegment (docSegment) -import Unison.NameSegment.Internal qualified as NameSegment -import Unison.Parser.Ann (Ann, Annotated (..)) +import Unison.Parser.Ann (Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) -import Unison.ShortHash qualified as SH -import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) -import Unison.Syntax.Lexer.Token (Token (..), posP, tokenP) -import Unison.Syntax.Name qualified as Name (isSymboly, nameP, toText, unsafeParseText) +import Unison.Syntax.Lexer.Token (Token (..), posP) +import Unison.Syntax.Name qualified as Name (nameP) import Unison.Syntax.NameSegment (symbolyIdChar, wordyIdChar, wordyIdStartChar) import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) -import Unison.Syntax.Parser.Doc.Data qualified as Doc -import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) +import Unison.Syntax.ReservedWords (typeModifiers, typeOrAbility) import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) -import Unison.Util.Bytes qualified as Bytes -import Unison.Util.Monoid (intercalateMap) instance (Annotated a) => Annotated (Cofree f a) where ann (a :< _) = ann a @@ -128,29 +124,6 @@ data Err | UnexpectedTokens String -- Catch-all for all other lexer errors, representing some unexpected tokens. deriving stock (Eq, Ord, Show) -- richer algebra --- Design principle: --- `[Lexeme]` should be sufficient information for parsing without --- further knowledge of spacing or indentation levels --- any knowledge of comments -data Lexeme - = Open String -- start of a block - | Semi IsVirtual -- separator between elements of a block - | Close -- end of a block - | Reserved String -- reserved tokens such as `{`, `(`, `type`, `of`, etc - | Textual String -- text literals, `"foo bar"` - | Character Char -- character literals, `?X` - | WordyId (HQ'.HashQualified Name) -- a (non-infix) identifier. invariant: last segment is wordy - | SymbolyId (HQ'.HashQualified Name) -- an infix identifier. invariant: last segment is symboly - | Blank String -- a typed hole or placeholder - | Numeric String -- numeric literals, left unparsed - | Bytes Bytes.Bytes -- bytes literals - | Hash ShortHash -- hash literals - | Err Err - | Doc (Doc.UntitledSection DocTree) - deriving stock (Eq, Show, Ord) - -type IsVirtual = Bool -- is it a virtual semi or an actual semi? - space :: P () space = LP.space @@ -163,15 +136,6 @@ space = lit :: String -> P String lit = P.try . LP.symbol (pure ()) -token :: P Lexeme -> P [Token Lexeme] -token = token' (\a start end -> [Token a start end]) - --- Token parser: strips trailing whitespace and comments after a --- successful parse, and also takes care of emitting layout tokens --- (such as virtual semicolons and closing tokens). -token' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] -token' tok p = LP.lexeme space (token'' tok p) - -- Committed failure err :: Pos -> Err -> P x err start t = do @@ -193,283 +157,11 @@ commitAfter2 a b f = do (a, b) <- P.try $ liftA2 (,) a b f a b --- Token parser implementation which leaves trailing whitespace and comments --- but does emit layout tokens such as virtual semicolons and closing tokens. -token'' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] -token'' tok p = do - start <- posP - -- We save the current state so we can backtrack the state if `p` fails. - env <- S.get - layoutToks <- case opening env of - -- If we're opening a block named b, we push (b, currentColumn) onto - -- the layout stack. Example: - -- - -- blah = cases - -- {- A comment -} - -- -- A one-line comment - -- 0 -> "hi" - -- 1 -> "bye" - -- - -- After the `cases` token, the state will be opening = Just "cases", - -- meaning the parser is searching for the next non-whitespace/comment - -- character to determine the leftmost column of the `cases` block. - -- That will be the column of the `0`. - Just blockname -> - -- special case - handling of empty blocks, as in: - -- foo = - -- bar = 42 - if blockname == "=" && column start <= top l && not (null l) - then do - S.put (env {layout = (blockname, column start + 1) : l, opening = Nothing}) - pops start - else [] <$ S.put (env {layout = layout', opening = Nothing}) - where - layout' = (blockname, column start) : l - l = layout env - -- If we're not opening a block, we potentially pop from - -- the layout stack and/or emit virtual semicolons. - Nothing -> if inLayout env then pops start else pure [] - beforeTokenPos <- posP - a <- p <|> (S.put env >> fail "resetting state") - endPos <- posP - pure $ layoutToks ++ tok a beforeTokenPos endPos - where - pops :: Pos -> P [Token Lexeme] - pops p = do - env <- S.get - let l = layout env - if top l == column p && topContainsVirtualSemis l - then pure [Token (Semi True) p p] - else - if column p > top l || topHasClosePair l - then pure [] - else - if column p < top l - then S.put (env {layout = pop l}) >> ((Token Close p p :) <$> pops p) - else error "impossible" - - -- don't emit virtual semis in (, {, or [ blocks - topContainsVirtualSemis :: Layout -> Bool - topContainsVirtualSemis = \case - [] -> False - ((name, _) : _) -> name /= "(" && name /= "{" && name /= "[" - - topHasClosePair :: Layout -> Bool - topHasClosePair [] = False - topHasClosePair ((name, _) : _) = - name `elem` ["DUMMY", "{", "(", "[", "handle", "match", "if", "then"] - -showErrorFancy :: (P.ShowErrorComponent e) => P.ErrorFancy e -> String -showErrorFancy = \case - P.ErrorFail msg -> msg - P.ErrorIndentation ord ref actual -> - "incorrect indentation (got " - <> show (P.unPos actual) - <> ", should be " - <> p - <> show (P.unPos ref) - <> ")" - where - p = case ord of - LT -> "less than " - EQ -> "equal to " - GT -> "greater than " - P.ErrorCustom a -> P.showErrorComponent a - -lexer :: String -> String -> [Token Lexeme] -lexer scope rem = - case flip S.evalState env0 $ P.runParserT (lexemes eof) scope rem of - Left e -> - let errsWithSourcePos = - fst $ - P.attachSourcePos - P.errorOffset - (toList (P.bundleErrors e)) - (P.bundlePosState e) - errorToTokens :: (EP.ParseError String (Token Err), P.SourcePos) -> [Token Lexeme] - errorToTokens (err, top) = case err of - P.FancyError _ (customErrs -> es) | not (null es) -> es - P.FancyError _errOffset es -> - let msg = intercalateMap "\n" showErrorFancy es - in [Token (Err (UnexpectedTokens msg)) (toPos top) (toPos top)] - P.TrivialError _errOffset mayUnexpectedTokens expectedTokens -> - let unexpectedStr :: Set String - unexpectedStr = - mayUnexpectedTokens - & fmap errorItemToString - & maybeToList - & Set.fromList - errorLength :: Int - errorLength = case Set.toList unexpectedStr of - [] -> 0 - (x : _) -> length x - expectedStr :: Set String - expectedStr = - expectedTokens - & Set.map errorItemToString - err = UnexpectedTokens $ formatTrivialError unexpectedStr expectedStr - startPos = toPos top - -- This is just an attempt to highlight errors better in source excerpts. - -- It may not work in all cases, but should generally provide a better experience. - endPos = startPos & \(Pos l c) -> Pos l (c + errorLength) - in [Token (Err err) startPos endPos] - in errsWithSourcePos >>= errorToTokens - Right ts -> postLex $ Token (Open scope) topLeftCorner topLeftCorner : ts - where - eof :: P [Token Lexeme] - eof = P.try do - p <- P.eof >> posP - n <- maybe 0 (const 1) <$> S.gets opening - l <- S.gets layout - pure $ replicate (length l + n) (Token Close p p) - errorItemToString :: EP.ErrorItem Char -> String - errorItemToString = \case - (P.Tokens ts) -> Foldable.toList ts - (P.Label ts) -> Foldable.toList ts - (P.EndOfInput) -> "end of input" - customErrs es = [Err <$> e | P.ErrorCustom e <- toList es] - toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col) - env0 = ParsingEnv [] (Just scope) True [0] 0 - --- | hacky postprocessing pass to do some cleanup of stuff that's annoying to --- fix without adding more state to the lexer: --- - 1+1 lexes as [1, +1], convert this to [1, +, 1] --- - when a semi followed by a virtual semi, drop the virtual, lets you --- write --- foo x = action1; --- 2 --- - semi immediately after first Open is ignored -tweak :: (Token Lexeme) -> [Token Lexeme] -> [Token Lexeme] -tweak h@(Token (Semi False) _ _) (Token (Semi True) _ _ : t) = h : t --- __NB__: This case only exists to guard against the following one -tweak h@(Token (Reserved _) _ _) t = h : t -tweak t1 (t2@(Token (Numeric num) _ _) : rem) - | notLayout t1 && touches t1 t2 && isSigned num = - t1 - : Token - (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num))))) - (start t2) - (inc $ start t2) - : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2) - : rem - where - isSigned num = all (\ch -> ch == '-' || ch == '+') $ take 1 num -tweak h t = h : t - -formatTrivialError :: Set String -> Set String -> [Char] -formatTrivialError unexpectedTokens expectedTokens = - let unexpectedMsg = case Set.toList unexpectedTokens of - [] -> "I found something I didn't expect." - [x] -> - let article = case x of - (c : _) | c `elem` ("aeiou" :: String) -> "an" - _ -> "a" - in "I was surprised to find " <> article <> " " <> x <> " here." - xs -> "I was surprised to find these:\n\n* " <> List.intercalate "\n* " xs - expectedMsg = case Set.toList expectedTokens of - [] -> Nothing - xs -> Just $ "\nI was expecting one of these instead:\n\n* " <> List.intercalate "\n* " xs - in concat $ catMaybes [Just unexpectedMsg, expectedMsg] - -displayLexeme :: Lexeme -> String -displayLexeme = \case - Open o -> o - Semi True -> "end of stanza" - Semi False -> "semicolon" - Close -> "end of section" - Reserved r -> "'" <> r <> "'" - Textual t -> "\"" <> t <> "\"" - Character c -> "?" <> [c] - WordyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq) - SymbolyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq) - Blank b -> b - Numeric n -> n - Bytes _b -> "bytes literal" - Hash h -> Text.unpack (SH.toText h) - Err e -> show e - Doc _ -> "doc structure" - infixl 2 <+> (<+>) :: (Monoid a) => P a -> P a -> P a p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) -type DocTree = Cofree (Doc.Top [Token Lexeme]) Ann - --- | The `Doc` lexer as documented on unison-lang.org -doc2 :: P [Token Lexeme] -doc2 = do - -- Ensure we're at a doc before we start consuming tokens - P.lookAhead (lit "{{") - openStart <- posP - -- Produce any layout tokens, such as closing the last open block or virtual semicolons - -- We don't use 'token' on "{{" directly because we don't want to duplicate layout - -- tokens if we do the rewrite hack for type-docs below. - beforeStartToks <- token' ignore (pure ()) - void $ lit "{{" - openEnd <- posP - CP.space - env0 <- S.get - -- Disable layout while parsing the doc block and reset the section number - (docTok, closeTok) <- local - ( \env -> - env - { inLayout = False, - parentSections = 0 : (parentSections env0) - } - ) - do - body <- docBody (lit "}}") - closeStart <- posP - lit "}}" - closeEnd <- posP - pure (Token (Doc body) openStart closeEnd, Token Close closeStart closeEnd) - -- Parse any layout tokens after the doc block, e.g. virtual semicolon - endToks <- token' ignore (pure ()) - -- Hack to allow anonymous doc blocks before type decls - -- {{ Some docs }} Foo.doc = {{ Some docs }} - -- ability Foo where => ability Foo where - -- - -- __FIXME__: This should be done _after_ parsing, not in lexing. - tn <- subsequentTypeName - pure $ - beforeStartToks <> case (tn) of - -- If we're followed by a type, we rewrite the doc block to be a named doc block. - Just (WordyId tname) - | isTopLevel -> - Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd - : Token (Open "=") openStart openEnd - : docTok - -- We need an extra 'Close' here because we added an extra Open above. - : closeTok - : endToks - where - isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docTok : endToks - where - -- DUPLICATED - wordyKw kw = separated wordySep (lit kw) - subsequentTypeName = P.lookAhead . P.optional $ do - let lit' s = lit s <* sp - let modifier = typeModifiersAlt (lit' . Text.unpack) - let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) - _ <- optional modifier *> typeOrAbility' *> sp - Token name start stop <- tokenP identifierP - if Name.isSymboly (HQ'.toName name) - then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) - else pure (WordyId name) - ignore _ _ _ = [] - -- DUPLICATED - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 - -- | Like `P.some`, but returns an actual `NonEmpty`. some' :: P a -> P (NonEmpty a) some' p = liftA2 (:|) p $ many p @@ -482,761 +174,12 @@ someTill' p end = liftA2 (:|) p $ P.manyTill p end sepBy1' :: P a -> P sep -> P (NonEmpty a) sepBy1' p sep = liftA2 (:|) p . many $ sep *> p --- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that --- Unison wraps `Doc` literals in `}}`). -docBody :: P end -> P (Doc.UntitledSection DocTree) -docBody docClose = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) - where - wordyKw kw = separated wordySep (lit kw) - sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap' . Doc.Paragraph <$> spaced leaf - reserved word = List.isPrefixOf "}}" word || all (== '#') word - - wordy :: P end -> P (Doc.Leaf [Token Lexeme] void) - wordy closing = fmap Doc.Word . tokenP . P.try $ do - let end = - P.lookAhead $ - void docClose - <|> void (P.satisfy isSpace) - <|> void closing - word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end - guard (not $ reserved word || null word) - pure word - - leafy closing = groupy closing gs - where - gs = - link - <|> externalLink - <|> exampleInline - <|> expr - <|> boldOrItalicOrStrikethrough closing - <|> verbatim - <|> atDoc - <|> wordy closing - - leaf = leafy mzero - - atDoc = src <|> evalInline <|> signature <|> signatureInline - where - comma = lit "," <* CP.space - src = - src' Doc.Source "@source" - <|> src' Doc.FoldedSource "@foldedSource" - srcElem = - Doc.SourceElement - <$> (typeLink <|> termLink) - <*> ( fmap (fromMaybe []) . P.optional $ - (lit "@") *> (CP.space *> annotations) - ) - where - annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space - annotations = - P.some (Doc.EmbedAnnotation <$> annotation) - src' name atName = fmap name $ do - _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' srcElem comma - _ <- lit "}" - pure s - signature = fmap Doc.Signature $ do - _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' signatureLink comma - _ <- lit "}" - pure s - signatureInline = fmap Doc.SignatureInline $ do - _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- signatureLink - _ <- lit "}" - pure s - evalInline = fmap Doc.EvalInline $ do - _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space - let inlineEvalClose = [] <$ lit "}" - s <- lexemes' inlineEvalClose - pure s - - typeLink = fmap Doc.EmbedTypeLink $ do - _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tokenP identifierP <* CP.space - - termLink = - fmap Doc.EmbedTermLink $ - tokenP identifierP <* CP.space - - signatureLink = - fmap Doc.EmbedSignatureLink $ - tokenP identifierP <* CP.space - - groupy closing p = do - Token p _ _ <- tokenP p - after <- P.optional . P.try $ leafy closing - pure $ case after of - Nothing -> p - Just after -> - Doc.Group - . Doc.Join - $ p - :| pure after - - verbatim = - P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do - Token originalText start stop <- tokenP do - -- a single backtick followed by a non-backtick is treated as monospaced - let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) - -- also two or more ' followed by that number of closing ' - quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) - P.someTill P.anySingle (lit quotes) - let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . Doc.Verbatim $ - Doc.Word $ - Token txt start stop - else - pure . Doc.Code $ - Doc.Word $ - Token originalText start stop - - exampleInline = - P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - fmap Doc.Example $ do - n <- P.try $ do - _ <- lit "`" - length <$> P.takeWhile1P (Just "backticks") (== '`') - let end :: P [Token Lexeme] = [] <$ lit (replicate (n + 1) '`') - ex <- CP.space *> lexemes' end - pure ex - - link = - P.label "link (examples: {type List}, {Nat.+})" $ - fmap Doc.Link $ - P.try $ - lit "{" *> (typeLink <|> termLink) <* lit "}" - - expr = - fmap Doc.Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - lit "{{" *> lexemes' ([] <$ lit "}}") - - nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' - nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace - - -- Allows whitespace or a newline, but not more than two newlines in a row. - whitespaceWithoutParagraphBreak :: P () - whitespaceWithoutParagraphBreak = void do - void nonNewlineSpaces - optional newline >>= \case - Just _ -> void nonNewlineSpaces - Nothing -> pure () - - fencedBlock = - P.label "block eval (syntax: a fenced code block)" $ - evalUnison <|> exampleBlock <|> other - where - evalUnison = fmap (wrap' . Doc.Eval) $ do - -- commit after seeing that ``` is on its own line - fence <- P.try $ do - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) - fence <$ guard b - CP.space - *> lexemes' ([] <$ lit fence) - - exampleBlock = fmap (wrap' . Doc.ExampleBlock) $ do - void $ lit "@typecheck" <* CP.space - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - lexemes' $ [] <$ lit fence - - uncolumn column tabWidth s = - let skip col r | col < 1 = r - skip col s@('\t' : _) | col < tabWidth = s - skip col ('\t' : r) = skip (col - tabWidth) r - skip col (c : r) - | isSpace c && (not $ isControl c) = - skip (col - 1) r - skip _ s = s - in List.intercalate "\n" $ skip column <$> lines s - - other = fmap (uncurry $ wrapSimple2 Doc.CodeBlock) $ do - column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel - let tabWidth = toInteger . P.unPos $ P.defaultTabWidth - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - name <- - P.takeWhileP Nothing nonNewlineSpace - *> tokenP (P.takeWhile1P Nothing (not . isSpace)) - <* P.takeWhileP Nothing nonNewlineSpace - _ <- void CP.eol - verbatim <- - tokenP $ - uncolumn column tabWidth . trimAroundDelimiters - <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name, verbatim) - - boldOrItalicOrStrikethrough closing = do - let start = - some (P.satisfy (== '*')) - <|> some (P.satisfy (== '_')) - <|> some - (P.satisfy (== '~')) - name s = - if take 1 s == "~" - then Doc.Strikethrough - else if take 1 s == "*" then Doc.Bold else Doc.Italic - end <- P.try $ do - end <- start - P.lookAhead (P.satisfy (not . isSpace)) - pure end - name end . wrap' . Doc.Paragraph - <$> someTill' - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) - - externalLink = - P.label "hyperlink (example: [link name](https://destination.com))" $ - fmap (uncurry Doc.NamedLink) $ do - _ <- lit "[" - p <- leafies (void $ char ']') - _ <- lit "]" - _ <- lit "(" - target <- - fmap (Doc.Group . Doc.Join) $ - fmap pure link <|> some' (expr <|> wordy (char ')')) - _ <- lit ")" - pure (p, target) - - -- newline = P.optional (lit "\r") *> lit "\n" - - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 - - spaced p = some' (p <* P.optional sp) - leafies close = wrap' . Doc.Paragraph <$> spaced (leafy close) - - list = bulletedList <|> numberedList - - bulletedList = wrap' . Doc.BulletedList <$> sepBy1' bullet listSep - numberedList = wrap' . Doc.NumberedList <$> sepBy1' numberedItem listSep - - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) - - bulletedStart = P.try $ do - r <- listItemStart' $ [] <$ P.satisfy bulletChar - P.lookAhead (P.satisfy isSpace) - pure r - where - bulletChar ch = ch == '*' || ch == '-' || ch == '+' - - listItemStart' :: P a -> P (Int, a) - listItemStart' gutter = P.try $ do - nonNewlineSpaces - col <- column <$> posP - parentCol <- S.gets parentListColumn - guard (col > parentCol) - (col,) <$> gutter - - numberedStart = - listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - - listItemParagraph = fmap (wrap' . Doc.Paragraph) $ do - col <- column <$> posP - some' (leaf <* sep col) - where - -- Trickiness here to support hard line breaks inside of - -- a bulleted list, so for instance this parses as expected: - -- - -- * uno dos - -- tres quatro - -- * alice bob - -- carol dave eve - sep col = do - _ <- nonNewlineSpaces - _ <- - P.optional . P.try $ - newline - *> nonNewlineSpaces - *> do - col2 <- column <$> posP - guard $ col2 >= col - (P.notFollowedBy $ void numberedStart <|> void bulletedStart) - pure () - - numberedItem = P.label msg $ do - (col, s) <- numberedStart - (s,) - <$> ( fmap (uncurry Doc.Column) $ do - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p, subList) - ) - where - msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" - - bullet = fmap (uncurry Doc.Column) . P.label "bullet (examples: * item1, - item2)" $ do - (col, _) <- bulletedStart - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local - (\e -> e {parentListColumn = col}) - (P.optional $ listSep *> list) - pure (p, subList) - - newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - - -- ## Section title - -- - -- A paragraph under this section. - -- Part of the same paragraph. Blanklines separate paragraphs. - -- - -- ### A subsection title - -- - -- A paragraph under this subsection. - - -- # A section title (not a subsection) - section :: P DocTree - section = fmap (wrap' . uncurry Doc.Section) $ do - ns <- S.gets parentSections - hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp - title <- paragraph <* CP.space - let m = length hashes + head ns - body <- - local (\env -> env {parentSections = (m : (tail ns))}) $ - P.many (sectionElem <* CP.space) - pure $ (title, body) - - wrap' :: Doc.Top [Token Lexeme] DocTree -> DocTree - wrap' doc = ann doc :< doc - - wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Doc.Top [Token Lexeme] DocTree) -> a -> b -> DocTree - wrapSimple2 fn a b = ann a <> ann b :< fn a b - -lexemes' :: P [Token Lexeme] -> P [Token Lexeme] -lexemes' eof = - -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one, - -- runs `postLex`, then removes it. - fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) $ - local (\env -> env {inLayout = True, opening = Just "DUMMY"}) do - p <- lexemes eof - -- deals with a final "unclosed" block at the end of `p`) - unclosed <- takeWhile (("DUMMY" /=) . fst) . layout <$> S.get - let pos = end $ last p - pure $ p <> replicate (length unclosed) (Token Close pos pos) - --- | Consumes an entire Unison “module”. -lexemes :: P [Token Lexeme] -> P [Token Lexeme] -lexemes eof = - P.optional space >> do - hd <- join <$> P.manyTill toks (P.lookAhead eof) - tl <- eof - pure $ hd <> tl - where - toks :: P [Token Lexeme] - toks = - doc2 - <|> doc - <|> token numeric - <|> token character - <|> reserved - <|> token blank - <|> token identifierLexemeP - <|> (asum . map token) [semi, textual, hash] - - doc :: P [Token Lexeme] - doc = open <+> (CP.space *> fmap fixup body) <+> (close <* space) - where - open = token'' (\t _ _ -> t) $ tok (Open <$> lit "[:") - close = tok (Close <$ lit ":]") - at = lit "@" - -- this removes some trailing whitespace from final textual segment - fixup [] = [] - fixup (Token (Textual (reverse -> txt)) start stop : []) = - [Token (Textual txt') start stop] - where - txt' = reverse (dropWhile (\c -> isSpace c && not (c == '\n')) txt) - fixup (h : t) = h : fixup t - - body :: P [Token Lexeme] - body = txt <+> (atk <|> pure []) - where - ch = (":]" <$ lit "\\:]") <|> ("@" <$ lit "\\@") <|> (pure <$> P.anySingle) - txt = tok (Textual . join <$> P.manyTill ch (P.lookAhead sep)) - sep = void at <|> void close - ref = at *> (tok identifierLexemeP <|> docTyp) - atk = (ref <|> docTyp) <+> body - docTyp = do - _ <- lit "[" - typ <- tok (P.manyTill P.anySingle (P.lookAhead (lit "]"))) - _ <- lit "]" *> CP.space - t <- tok identifierLexemeP - pure $ (fmap Reserved <$> typ) <> t - - blank = - separated wordySep do - _ <- char '_' - seg <- P.optional wordyIdSegP - pure (Blank (maybe "" (Text.unpack . NameSegment.toUnescapedText) seg)) - - semi = char ';' $> Semi False - textual = Textual <$> quoted - quoted = quotedRaw <|> quotedSingleLine - quotedRaw = do - _ <- lit "\"\"\"" - n <- many (char '"') - _ <- optional (char '\n') -- initial newline is skipped - s <- P.manyTill P.anySingle (lit (replicate (length n + 3) '"')) - col0 <- column <$> posP - let col = col0 - (length n) - 3 -- this gets us first col of closing quotes - let leading = replicate (max 0 (col - 1)) ' ' - -- a last line that's equal to `leading` is ignored, since leading - -- spaces up to `col` are not considered part of the string - let tweak l = case reverse l of - last : rest - | col > 1 && last == leading -> reverse rest - | otherwise -> l - [] -> [] - pure $ case tweak (lines s) of - [] -> s - ls - | all (\l -> List.isPrefixOf leading l || all isSpace l) ls -> List.intercalate "\n" (drop (length leading) <$> ls) - | otherwise -> s - quotedSingleLine = char '"' *> P.manyTill (LP.charLiteral <|> sp) (char '"') - where - sp = lit "\\s" $> ' ' - character = Character <$> (char '?' *> (spEsc <|> LP.charLiteral)) - where - spEsc = P.try (char '\\' *> char 's' $> ' ') - - numeric = bytes <|> otherbase <|> float <|> intOrNat - where - intOrNat = P.try $ num <$> sign <*> LP.decimal - float = do - _ <- P.try (P.lookAhead (sign >> (LP.decimal :: P Int) >> (char '.' <|> char 'e' <|> char 'E'))) -- commit after this - start <- posP - sign <- fromMaybe "" <$> sign - base <- P.takeWhile1P (Just "base") isDigit - decimals <- - P.optional $ - let missingFractional = err start (MissingFractional $ base <> ".") - in liftA2 (<>) (lit ".") (P.takeWhile1P (Just "decimals") isDigit <|> missingFractional) - exp <- P.optional $ do - e <- map toLower <$> (lit "e" <|> lit "E") - sign <- fromMaybe "" <$> optional (lit "+" <|> lit "-") - let missingExp = err start (MissingExponent $ base <> fromMaybe "" decimals <> e <> sign) - exp <- P.takeWhile1P (Just "exponent") isDigit <|> missingExp - pure $ e <> sign <> exp - pure $ Numeric (sign <> base <> fromMaybe "" decimals <> fromMaybe "" exp) - - bytes = do - start <- posP - _ <- lit "0xs" - s <- map toLower <$> P.takeWhileP (Just "hexidecimal character") isAlphaNum - case Bytes.fromBase16 $ Bytes.fromWord8s (fromIntegral . ord <$> s) of - Left _ -> err start (InvalidBytesLiteral $ "0xs" <> s) - Right bs -> pure (Bytes bs) - otherbase = octal <|> hex - octal = do - start <- posP - commitAfter2 sign (lit "0o") $ \sign _ -> - fmap (num sign) LP.octal <|> err start InvalidOctalLiteral - hex = do - start <- posP - commitAfter2 sign (lit "0x") $ \sign _ -> - fmap (num sign) LP.hexadecimal <|> err start InvalidHexLiteral - - num :: Maybe String -> Integer -> Lexeme - num sign n = Numeric (fromMaybe "" sign <> show n) - sign = P.optional (lit "+" <|> lit "-") - - hash = Hash <$> P.try shortHashP - - reserved :: P [Token Lexeme] - reserved = - token' (\ts _ _ -> ts) $ - braces - <|> parens - <|> brackets - <|> commaSeparator - <|> delim - <|> delayOrForce - <|> keywords - <|> layoutKeywords - where - keywords = - -- yes "wordy" - just like a wordy keyword like "true", the literal "." (as in the dot in - -- "forall a. a -> a") is considered the keyword "." so long as it is either followed by EOF, a space, or some - -- non-wordy character (because ".foo" is a single identifier lexeme) - wordyKw "." - <|> symbolyKw ":" - <|> openKw "@rewrite" - <|> symbolyKw "@" - <|> symbolyKw "||" - <|> symbolyKw "|" - <|> symbolyKw "&&" - <|> wordyKw "true" - <|> wordyKw "false" - <|> wordyKw "use" - <|> wordyKw "forall" - <|> wordyKw "∀" - <|> wordyKw "termLink" - <|> wordyKw "typeLink" - - wordyKw s = separated wordySep (kw s) - symbolyKw s = separated (not . symbolyIdChar) (kw s) - - kw :: String -> P [Token Lexeme] - kw s = tokenP (lit s) <&> \token -> [Reserved <$> token] - - layoutKeywords :: P [Token Lexeme] - layoutKeywords = - ifElse - <|> withKw - <|> openKw "match" - <|> openKw "handle" - <|> typ - <|> arr - <|> rewriteArr - <|> eq - <|> openKw "cases" - <|> openKw "where" - <|> openKw "let" - <|> openKw "do" - where - ifElse = - openKw "if" - <|> closeKw' (Just "then") ["if"] (lit "then") - <|> closeKw' (Just "else") ["then"] (lit "else") - modKw = typeModifiersAlt (openKw1 wordySep . Text.unpack) - typeOrAbilityKw = typeOrAbilityAlt (openTypeKw1 . Text.unpack) - typ = modKw <|> typeOrAbilityKw - - withKw = do - [Token _ pos1 pos2] <- wordyKw "with" - env <- S.get - let l = layout env - case findClose ["handle", "match"] l of - Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen "'with'") - where - msgOpen = "'handle' or 'match'" - Just (withBlock, n) -> do - let b = withBlock <> "-with" - S.put (env {layout = drop n l, opening = Just b}) - let opens = [Token (Open "with") pos1 pos2] - pure $ replicate n (Token Close pos1 pos2) ++ opens - - -- In `structural/unique type` and `structural/unique ability`, - -- only the `structural` or `unique` opens a layout block, - -- and `ability` and `type` are just keywords. - openTypeKw1 t = do - b <- S.gets (topBlockName . layout) - case b of - Just mod | Set.member (Text.pack mod) typeModifiers -> wordyKw t - _ -> openKw1 wordySep t - - -- layout keyword which bumps the layout column by 1, rather than looking ahead - -- to the next token to determine the layout column - openKw1 :: (Char -> Bool) -> String -> P [Token Lexeme] - openKw1 sep kw = do - Token kw pos0 pos1 <- tokenP $ separated sep (lit kw) - S.modify (\env -> env {layout = (kw, column $ inc pos0) : layout env}) - pure [Token (Open kw) pos0 pos1] - - eq = do - [Token _ start end] <- symbolyKw "=" - env <- S.get - case topBlockName (layout env) of - -- '=' does not open a layout block if within a type declaration - Just t | t == "type" || Set.member (Text.pack t) typeModifiers -> pure [Token (Reserved "=") start end] - Just _ -> S.put (env {opening = Just "="}) >> pure [Token (Open "=") start end] - _ -> err start LayoutError - - rewriteArr = do - [Token _ start end] <- symbolyKw "==>" - env <- S.get - S.put (env {opening = Just "==>"}) >> pure [Token (Open "==>") start end] - - arr = do - [Token _ start end] <- symbolyKw "->" - env <- S.get - -- -> introduces a layout block if we're inside a `match with` or `cases` - case topBlockName (layout env) of - Just match | match `elem` matchWithBlocks -> do - S.put (env {opening = Just "->"}) - pure [Token (Open "->") start end] - _ -> pure [Token (Reserved "->") start end] - - -- a bit of lookahead here to reserve }} for closing a documentation block - braces = open "{" <|> close ["{"] p - where - p = do - l <- lit "}" - -- if we're within an existing {{ }} block, inLayout will be false - -- so we can actually allow }} to appear in normal code - inLayout <- S.gets inLayout - when (not inLayout) $ void $ P.lookAhead (P.satisfy (/= '}')) - pure l - matchWithBlocks = ["match-with", "cases"] - parens = open "(" <|> close ["("] (lit ")") - brackets = open "[" <|> close ["["] (lit "]") - -- `allowCommaToClose` determines if a comma should close inner blocks. - -- Currently there is a set of blocks where `,` is not treated specially - -- and it just emits a Reserved ",". There are currently only three: - -- `cases`, `match-with`, and `{` - allowCommaToClose match = not $ match `elem` ("{" : matchWithBlocks) - commaSeparator = do - env <- S.get - case topBlockName (layout env) of - Just match - | allowCommaToClose match -> - blockDelimiter ["[", "("] (lit ",") - _ -> fail "this comma is a pattern separator" - - delim = P.try $ do - ch <- P.satisfy (\ch -> ch /= ';' && Set.member ch delimiters) - pos <- posP - pure [Token (Reserved [ch]) pos (inc pos)] - - delayOrForce = separated ok $ do - token <- tokenP $ P.satisfy isDelayOrForce - pure [token <&> \op -> Reserved [op]] - where - ok c = isDelayOrForce c || isSpace c || isAlphaNum c || Set.member c delimiters || c == '\"' - --- | If it's a multi-line verbatim block we trim any whitespace representing --- indentation from the pretty-printer. --- --- E.g. --- --- @@ --- {{ --- # Heading --- ''' --- code --- indented --- ''' --- }} --- @@ --- --- Should lex to the text literal "code\n indented". --- --- If there's text in the literal that has LESS trailing whitespace than the --- opening delimiters, we don't trim it at all. E.g. --- --- @@ --- {{ --- # Heading --- ''' --- code --- ''' --- }} --- @@ --- --- Is parsed as " code". --- --- Trim the expected amount of whitespace from a text literal: --- >>> trimIndentFromVerbatimBlock 2 " code\n indented" --- "code\n indented" --- --- If the text literal has less leading whitespace than the opening delimiters, --- leave it as-is --- >>> trimIndentFromVerbatimBlock 2 "code\n indented" --- "code\n indented" -trimIndentFromVerbatimBlock :: Int -> String -> String -trimIndentFromVerbatimBlock leadingSpaces txt = fromMaybe txt $ do - List.intercalate "\n" <$> for (lines txt) \line -> do - -- If any 'stripPrefix' fails, we fail and return the unaltered text - case List.stripPrefix (replicate leadingSpaces ' ') line of - Just stripped -> Just stripped - Nothing -> - -- If it was a line with all white-space, just use an empty line, - -- this can happen easily in editors which trim trailing whitespace. - if all isSpace line - then Just "" - else Nothing - --- Trim leading/trailing whitespace from around delimiters, e.g. --- --- {{ --- '''___ <- whitespace here including newline --- text block --- 👇 or here --- __''' --- }} --- >>> trimAroundDelimiters " \n text block \n " --- " text block " --- --- Should leave leading and trailing line untouched if it contains non-whitespace, e.g.: --- --- ''' leading whitespace --- text block --- trailing whitespace: ''' --- >>> trimAroundDelimiters " leading whitespace\n text block \ntrailing whitespace: " --- " leading whitespace\n text block \ntrailing whitespace: " --- --- Should keep trailing newline if it's the only thing on the line, e.g.: --- --- ''' --- newline below --- --- ''' --- >>> trimAroundDelimiters "\nnewline below\n\n" --- "newline below\n\n" -trimAroundDelimiters :: String -> String -trimAroundDelimiters txt = - txt - & ( \s -> - List.breakOn "\n" s - & \case - (prefix, suffix) - | all isSpace prefix -> drop 1 suffix - | otherwise -> prefix <> suffix - ) - & ( \s -> - List.breakOnEnd "\n" s - & \case - (_prefix, "") -> s - (prefix, suffix) - | all isSpace suffix -> dropTrailingNewline prefix - | otherwise -> prefix <> suffix - ) - where - dropTrailingNewline = \case - [] -> [] - (x : xs) -> NonEmpty.init (x NonEmpty.:| xs) - separated :: (Char -> Bool) -> P a -> P a separated ok p = P.try $ p <* P.lookAhead (void (P.satisfy ok) <|> P.eof) -open :: String -> P [Token Lexeme] -open b = do - token <- tokenP $ lit b - env <- S.get - S.put (env {opening = Just b}) - pure [Open b <$ token] - -openKw :: String -> P [Token Lexeme] -openKw s = separated wordySep $ do - token <- tokenP $ lit s - env <- S.get - S.put (env {opening = Just s}) - pure [Open <$> token] - wordySep :: Char -> Bool wordySep c = isSpace c || not (wordyIdChar c) -tok :: P a -> P [Token a] -tok p = do - token <- tokenP p - pure [token] - -- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is -- symboly (comprised of only symbols) or wordy (comprised of only alphanums). -- @@ -1258,23 +201,6 @@ identifierP = do NameSegment.ReservedOperator s -> ReservedSymbolyId (Text.unpack s) NameSegment.ReservedWord s -> ReservedWordyId (Text.unpack s) --- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is --- symboly (comprised of only symbols) or wordy (comprised of only alphanums). --- --- Examples: --- --- foo --- .foo.++.doc --- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") -identifierLexemeP :: P Lexeme -identifierLexemeP = identifierLexeme <$> identifierP - -identifierLexeme :: HQ'.HashQualified Name -> Lexeme -identifierLexeme name = - if Name.isSymboly (HQ'.toName name) - then SymbolyId name - else WordyId name - wordyIdSegP :: P NameSegment wordyIdSegP = PI.withParsecT (fmap (ReservedWordyId . Text.unpack)) NameSegment.wordyP @@ -1283,59 +209,11 @@ shortHashP :: P ShortHash shortHashP = PI.withParsecT (fmap (InvalidShortHash . Text.unpack)) ShortHash.shortHashP -blockDelimiter :: [String] -> P String -> P [Token Lexeme] -blockDelimiter open closeP = do - Token close pos1 pos2 <- tokenP closeP - env <- S.get - case findClose open (layout env) of - Nothing -> err pos1 (UnexpectedDelimiter (quote close)) - where - quote s = "'" <> s <> "'" - Just (_, n) -> do - S.put (env {layout = drop (n - 1) (layout env)}) - let delims = [Token (Reserved close) pos1 pos2] - pure $ replicate (n - 1) (Token Close pos1 pos2) ++ delims - -close :: [String] -> P String -> P [Token Lexeme] -close = close' Nothing - -closeKw' :: Maybe String -> [String] -> P String -> P [Token Lexeme] -closeKw' reopenBlockname open closeP = close' reopenBlockname open (separated wordySep closeP) - -close' :: Maybe String -> [String] -> P String -> P [Token Lexeme] -close' reopenBlockname open closeP = do - Token close pos1 pos2 <- tokenP closeP - env <- S.get - case findClose open (layout env) of - Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen (quote close)) - where - msgOpen = List.intercalate " or " (quote <$> open) - quote s = "'" <> s <> "'" - Just (_, n) -> do - S.put (env {layout = drop n (layout env), opening = reopenBlockname}) - let opens = maybe [] (const $ [Token (Open close) pos1 pos2]) reopenBlockname - pure $ replicate n (Token Close pos1 pos2) ++ opens - -findClose :: [String] -> Layout -> Maybe (String, Int) -findClose _ [] = Nothing -findClose s ((h, _) : tl) = if h `elem` s then Just (h, 1) else fmap (1 +) <$> findClose s tl - -notLayout :: Token Lexeme -> Bool -notLayout t = case payload t of - Close -> False - Semi _ -> False - Open _ -> False - _ -> True - -- `True` if the tokens are adjacent, with no space separating the two touches :: Token a -> Token b -> Bool touches (end -> t) (start -> t2) = line t == line t2 && column t == column t2 -top :: Layout -> Column -top [] = 1 -top ((_, h) : _) = h - -- todo: make Layout a NonEmpty topBlockName :: Layout -> Maybe BlockName topBlockName [] = Nothing @@ -1344,122 +222,6 @@ topBlockName ((name, _) : _) = Just name pop :: [a] -> [a] pop = drop 1 -topLeftCorner :: Pos -topLeftCorner = Pos 1 1 - -data BlockTree a - = Block - -- | The token that opens the block - a - -- | “Stanzas” of nested tokens - [[BlockTree a]] - -- | The closing token, if any - (Maybe a) - | Leaf a - deriving (Functor, Foldable, Traversable) - -headToken :: BlockTree a -> a -headToken (Block a _ _) = a -headToken (Leaf a) = a - -instance (Show a) => Show (BlockTree a) where - show (Leaf a) = show a - show (Block open mid close) = - show open - ++ "\n" - ++ indent " " (intercalateMap "\n" (intercalateMap " " show) mid) - ++ "\n" - ++ maybe "" show close - where - indent by s = by ++ (s >>= go by) - go by '\n' = '\n' : by - go _ c = [c] - -reorderTree :: ([[BlockTree a]] -> [[BlockTree a]]) -> BlockTree a -> BlockTree a -reorderTree f (Block open mid close) = Block open (f (fmap (reorderTree f) <$> mid)) close -reorderTree _ l = l - -tree :: [Token Lexeme] -> BlockTree (Token Lexeme) -tree toks = one toks const - where - one (open@(payload -> Open _) : ts) k = many (Block open . stanzas) [] ts k - one (t : ts) k = k (Leaf t) ts - one [] k = k lastErr [] - where - lastErr = Leaf case drop (length toks - 1) toks of - [] -> Token (Err LayoutError) topLeftCorner topLeftCorner - (t : _) -> t {payload = Err LayoutError} - - many open acc [] k = k (open (reverse acc) Nothing) [] - many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) $ pure t) ts - many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k - -stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] -stanzas = - toList - . foldr - ( \tok (curr :| stanzas) -> case tok of - Leaf (Token (Semi _) _ _) -> [tok] :| curr : stanzas - _ -> (tok : curr) :| stanzas - ) - ([] :| []) - --- Moves type and ability declarations to the front of the token stream --- and move `use` statements to the front of each block -reorder :: [[BlockTree (Token Lexeme)]] -> [[BlockTree (Token Lexeme)]] -reorder = foldr fixup [] . sortWith f - where - f [] = 3 :: Int - f (t0 : _) = case payload $ headToken t0 of - Open mod | Set.member (Text.pack mod) typeModifiers -> 1 - Open typOrA | Set.member (Text.pack typOrA) typeOrAbility -> 1 - Reserved "use" -> 0 - _ -> 3 :: Int - -- after reordering can end up with trailing semicolon at the end of - -- a block, which we remove with this pass - fixup stanza [] = case Lens.unsnoc stanza of - Nothing -> [] - -- remove any trailing `Semi` from the last non-empty stanza - Just (init, Leaf (Token (Semi _) _ _)) -> [init] - -- don’t touch other stanzas - Just (_, _) -> [stanza] - fixup stanza tail = stanza : tail - --- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. -preParse :: [Token Lexeme] -> BlockTree (Token Lexeme) -preParse = reorderTree reorder . tree - --- | A few transformations that happen between lexing and parsing. --- --- All of these things should move out of the lexer, and be applied in the parse. -postLex :: [Token Lexeme] -> [Token Lexeme] -postLex = toList . preParse . foldr tweak [] - -isDelayOrForce :: Char -> Bool -isDelayOrForce op = op == '\'' || op == '!' - --- Mapping between characters and their escape codes. Use parse/showEscapeChar to convert. -escapeChars :: [(Char, Char)] -escapeChars = - [ ('0', '\0'), - ('a', '\a'), - ('b', '\b'), - ('f', '\f'), - ('n', '\n'), - ('r', '\r'), - ('t', '\t'), - ('v', '\v'), - ('s', ' '), - ('\'', '\''), - ('"', '"'), - ('\\', '\\') - ] - --- Inverse of parseEscapeChar; map a character to its escaped version: -showEscapeChar :: Char -> Maybe Char -showEscapeChar c = - Map.lookup c (Map.fromList [(x, y) | (y, x) <- escapeChars]) - typeOrAbilityAlt :: (Alternative f) => (Text -> f a) -> f a typeOrAbilityAlt f = asum $ map f (toList typeOrAbility) @@ -1471,28 +233,6 @@ typeModifiersAlt f = inc :: Pos -> Pos inc (Pos line col) = Pos line (col + 1) -debugFilePreParse :: FilePath -> IO () -debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file - -debugPreParse :: BlockTree (Token Lexeme) -> String -debugPreParse (Leaf (Token (Err (UnexpectedTokens msg)) start end)) = - (if start == end then msg1 else msg2) <> ":\n" <> msg - where - msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start) - msg2 = - "Error on line " - <> show (line start) - <> ", column " - <> show (column start) - <> " - line " - <> show (line end) - <> ", column " - <> show (column end) -debugPreParse ts = show $ payload <$> ts - -debugPreParse' :: String -> String -debugPreParse' = debugPreParse . preParse . lexer "debugPreParse" - instance EP.ShowErrorComponent (Token Err) where showErrorComponent (Token err _ _) = go err where @@ -1504,35 +244,3 @@ instance EP.ShowErrorComponent (Token Err) where TextLiteralMissingClosingQuote s -> "This text literal missing a closing quote: " <> excerpt s e -> show e excerpt s = if length s < 15 then s else take 15 s <> "..." - -instance P.VisualStream [Token Lexeme] where - showTokens _ xs = - join . Nel.toList . S.evalState (traverse go xs) . end $ Nel.head xs - where - go :: Token Lexeme -> S.State Pos String - go tok = do - prev <- S.get - S.put $ end tok - pure $ pad prev (start tok) ++ pretty (payload tok) - pretty (Open s) = s - pretty (Reserved w) = w - pretty (Textual t) = '"' : t ++ ['"'] - pretty (Character c) = - case showEscapeChar c of - Just c -> "?\\" ++ [c] - Nothing -> '?' : [c] - pretty (WordyId n) = Text.unpack (HQ'.toText n) - pretty (SymbolyId n) = Text.unpack (HQ'.toText n) - pretty (Blank s) = "_" ++ s - pretty (Numeric n) = n - pretty (Hash sh) = show sh - pretty (Err e) = show e - pretty (Bytes bs) = "0xs" <> show bs - pretty Close = "" - pretty (Semi True) = "" - pretty (Semi False) = ";" - pretty (Doc d) = show d - pad (Pos line1 col1) (Pos line2 col2) = - if line1 == line2 - then replicate (col2 - col1) ' ' - else replicate (line2 - line1) '\n' ++ replicate col2 ' ' diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs new file mode 100644 index 0000000000..dcaf9ca6d3 --- /dev/null +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -0,0 +1,910 @@ +{-# LANGUAGE TemplateHaskell #-} +{-# OPTIONS_GHC -Wno-orphans #-} + +module Unison.Syntax.Lexer.Unison + ( Token (..), + Line, + Column, + Err (..), + Pos (..), + Lexeme (..), + lexer, + preParse, + escapeChars, + debugFilePreParse, + debugPreParse, + debugPreParse', + showEscapeChar, + touches, + + -- * Character classifiers + wordyIdChar, + wordyIdStartChar, + symbolyIdChar, + + -- * Error formatting + formatTrivialError, + displayLexeme, + ) +where + +import Control.Lens qualified as Lens +import Control.Monad.State qualified as S +import Data.Char (isAlphaNum, isDigit, isSpace, ord, toLower) +import Data.Foldable qualified as Foldable +import Data.List qualified as List +import Data.List.NonEmpty (NonEmpty ((:|))) +import Data.List.NonEmpty qualified as Nel +import Data.Map.Strict qualified as Map +import Data.Set qualified as Set +import Data.Text qualified as Text +import GHC.Exts (sortWith) +import Text.Megaparsec qualified as P +import Text.Megaparsec.Char (char) +import Text.Megaparsec.Char qualified as CP +import Text.Megaparsec.Char.Lexer qualified as LP +import Text.Megaparsec.Error qualified as EP +import Unison.HashQualifiedPrime qualified as HQ' +import Unison.Name (Name) +import Unison.Name qualified as Name +import Unison.NameSegment qualified as NameSegment (docSegment) +import Unison.NameSegment.Internal qualified as NameSegment +import Unison.Prelude +import Unison.ShortHash (ShortHash) +import Unison.ShortHash qualified as SH +import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) +import Unison.Syntax.Lexer +import Unison.Syntax.Lexer.Token (posP, tokenP) +import Unison.Syntax.Name qualified as Name (isSymboly, toText, unsafeParseText) +import Unison.Syntax.Parser.Doc qualified as Doc +import Unison.Syntax.Parser.Doc.Data qualified as Doc +import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) +import Unison.Util.Bytes qualified as Bytes +import Unison.Util.Monoid (intercalateMap) + +-- Design principle: +-- `[Lexeme]` should be sufficient information for parsing without +-- further knowledge of spacing or indentation levels +-- any knowledge of comments +data Lexeme + = Open String -- start of a block + | Semi IsVirtual -- separator between elements of a block + | Close -- end of a block + | Reserved String -- reserved tokens such as `{`, `(`, `type`, `of`, etc + | Textual String -- text literals, `"foo bar"` + | Character Char -- character literals, `?X` + | WordyId (HQ'.HashQualified Name) -- a (non-infix) identifier. invariant: last segment is wordy + | SymbolyId (HQ'.HashQualified Name) -- an infix identifier. invariant: last segment is symboly + | Blank String -- a typed hole or placeholder + | Numeric String -- numeric literals, left unparsed + | Bytes Bytes.Bytes -- bytes literals + | Hash ShortHash -- hash literals + | Err Err + | Doc (Doc.UntitledSection (Doc.Tree [Token Lexeme])) + deriving stock (Eq, Show, Ord) + +type IsVirtual = Bool -- is it a virtual semi or an actual semi? + +token :: P Lexeme -> P [Token Lexeme] +token = token' (\a start end -> [Token a start end]) + +-- Token parser: strips trailing whitespace and comments after a +-- successful parse, and also takes care of emitting layout tokens +-- (such as virtual semicolons and closing tokens). +token' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] +token' tok p = LP.lexeme space (token'' tok p) + +-- Token parser implementation which leaves trailing whitespace and comments +-- but does emit layout tokens such as virtual semicolons and closing tokens. +token'' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] +token'' tok p = do + start <- posP + -- We save the current state so we can backtrack the state if `p` fails. + env <- S.get + layoutToks <- case opening env of + -- If we're opening a block named b, we push (b, currentColumn) onto + -- the layout stack. Example: + -- + -- blah = cases + -- {- A comment -} + -- -- A one-line comment + -- 0 -> "hi" + -- 1 -> "bye" + -- + -- After the `cases` token, the state will be opening = Just "cases", + -- meaning the parser is searching for the next non-whitespace/comment + -- character to determine the leftmost column of the `cases` block. + -- That will be the column of the `0`. + Just blockname -> + -- special case - handling of empty blocks, as in: + -- foo = + -- bar = 42 + if blockname == "=" && column start <= top l && not (null l) + then do + S.put (env {layout = (blockname, column start + 1) : l, opening = Nothing}) + pops start + else [] <$ S.put (env {layout = layout', opening = Nothing}) + where + layout' = (blockname, column start) : l + l = layout env + -- If we're not opening a block, we potentially pop from + -- the layout stack and/or emit virtual semicolons. + Nothing -> if inLayout env then pops start else pure [] + beforeTokenPos <- posP + a <- p <|> (S.put env >> fail "resetting state") + endPos <- posP + pure $ layoutToks ++ tok a beforeTokenPos endPos + where + pops :: Pos -> P [Token Lexeme] + pops p = do + env <- S.get + let l = layout env + if top l == column p && topContainsVirtualSemis l + then pure [Token (Semi True) p p] + else + if column p > top l || topHasClosePair l + then pure [] + else + if column p < top l + then S.put (env {layout = pop l}) >> ((Token Close p p :) <$> pops p) + else error "impossible" + + -- don't emit virtual semis in (, {, or [ blocks + topContainsVirtualSemis :: Layout -> Bool + topContainsVirtualSemis = \case + [] -> False + ((name, _) : _) -> name /= "(" && name /= "{" && name /= "[" + + topHasClosePair :: Layout -> Bool + topHasClosePair [] = False + topHasClosePair ((name, _) : _) = + name `elem` ["DUMMY", "{", "(", "[", "handle", "match", "if", "then"] + +showErrorFancy :: (P.ShowErrorComponent e) => P.ErrorFancy e -> String +showErrorFancy = \case + P.ErrorFail msg -> msg + P.ErrorIndentation ord ref actual -> + "incorrect indentation (got " + <> show (P.unPos actual) + <> ", should be " + <> p + <> show (P.unPos ref) + <> ")" + where + p = case ord of + LT -> "less than " + EQ -> "equal to " + GT -> "greater than " + P.ErrorCustom a -> P.showErrorComponent a + +lexer :: String -> String -> [Token Lexeme] +lexer scope rem = + case flip S.evalState env0 $ P.runParserT (lexemes eof) scope rem of + Left e -> + let errsWithSourcePos = + fst $ + P.attachSourcePos + P.errorOffset + (toList (P.bundleErrors e)) + (P.bundlePosState e) + errorToTokens :: (EP.ParseError String (Token Err), P.SourcePos) -> [Token Lexeme] + errorToTokens (err, top) = case err of + P.FancyError _ (customErrs -> es) | not (null es) -> es + P.FancyError _errOffset es -> + let msg = intercalateMap "\n" showErrorFancy es + in [Token (Err (UnexpectedTokens msg)) (toPos top) (toPos top)] + P.TrivialError _errOffset mayUnexpectedTokens expectedTokens -> + let unexpectedStr :: Set String + unexpectedStr = + mayUnexpectedTokens + & fmap errorItemToString + & maybeToList + & Set.fromList + errorLength :: Int + errorLength = case Set.toList unexpectedStr of + [] -> 0 + (x : _) -> length x + expectedStr :: Set String + expectedStr = + expectedTokens + & Set.map errorItemToString + err = UnexpectedTokens $ formatTrivialError unexpectedStr expectedStr + startPos = toPos top + -- This is just an attempt to highlight errors better in source excerpts. + -- It may not work in all cases, but should generally provide a better experience. + endPos = startPos & \(Pos l c) -> Pos l (c + errorLength) + in [Token (Err err) startPos endPos] + in errsWithSourcePos >>= errorToTokens + Right ts -> postLex $ Token (Open scope) topLeftCorner topLeftCorner : ts + where + eof :: P [Token Lexeme] + eof = P.try do + p <- P.eof >> posP + n <- maybe 0 (const 1) <$> S.gets opening + l <- S.gets layout + pure $ replicate (length l + n) (Token Close p p) + errorItemToString :: EP.ErrorItem Char -> String + errorItemToString = \case + (P.Tokens ts) -> Foldable.toList ts + (P.Label ts) -> Foldable.toList ts + (P.EndOfInput) -> "end of input" + customErrs es = [Err <$> e | P.ErrorCustom e <- toList es] + toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col) + env0 = ParsingEnv [] (Just scope) True [0] 0 + +-- | hacky postprocessing pass to do some cleanup of stuff that's annoying to +-- fix without adding more state to the lexer: +-- - 1+1 lexes as [1, +1], convert this to [1, +, 1] +-- - when a semi followed by a virtual semi, drop the virtual, lets you +-- write +-- foo x = action1; +-- 2 +-- - semi immediately after first Open is ignored +tweak :: (Token Lexeme) -> [Token Lexeme] -> [Token Lexeme] +tweak h@(Token (Semi False) _ _) (Token (Semi True) _ _ : t) = h : t +-- __NB__: This case only exists to guard against the following one +tweak h@(Token (Reserved _) _ _) t = h : t +tweak t1 (t2@(Token (Numeric num) _ _) : rem) + | notLayout t1 && touches t1 t2 && isSigned num = + t1 + : Token + (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num))))) + (start t2) + (inc $ start t2) + : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2) + : rem + where + isSigned num = all (\ch -> ch == '-' || ch == '+') $ take 1 num +tweak h t = h : t + +formatTrivialError :: Set String -> Set String -> [Char] +formatTrivialError unexpectedTokens expectedTokens = + let unexpectedMsg = case Set.toList unexpectedTokens of + [] -> "I found something I didn't expect." + [x] -> + let article = case x of + (c : _) | c `elem` ("aeiou" :: String) -> "an" + _ -> "a" + in "I was surprised to find " <> article <> " " <> x <> " here." + xs -> "I was surprised to find these:\n\n* " <> List.intercalate "\n* " xs + expectedMsg = case Set.toList expectedTokens of + [] -> Nothing + xs -> Just $ "\nI was expecting one of these instead:\n\n* " <> List.intercalate "\n* " xs + in concat $ catMaybes [Just unexpectedMsg, expectedMsg] + +displayLexeme :: Lexeme -> String +displayLexeme = \case + Open o -> o + Semi True -> "end of stanza" + Semi False -> "semicolon" + Close -> "end of section" + Reserved r -> "'" <> r <> "'" + Textual t -> "\"" <> t <> "\"" + Character c -> "?" <> [c] + WordyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq) + SymbolyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq) + Blank b -> b + Numeric n -> n + Bytes _b -> "bytes literal" + Hash h -> Text.unpack (SH.toText h) + Err e -> show e + Doc _ -> "doc structure" + +-- | The `Doc` lexer as documented on unison-lang.org +doc2 :: P [Token Lexeme] +doc2 = do + -- Ensure we're at a doc before we start consuming tokens + P.lookAhead (lit "{{") + openStart <- posP + -- Produce any layout tokens, such as closing the last open block or virtual semicolons + -- We don't use 'token' on "{{" directly because we don't want to duplicate layout + -- tokens if we do the rewrite hack for type-docs below. + beforeStartToks <- token' ignore (pure ()) + void $ lit "{{" + openEnd <- posP + CP.space + env0 <- S.get + -- Disable layout while parsing the doc block and reset the section number + (docTok, closeTok) <- local + ( \env -> + env + { inLayout = False, + parentSections = 0 : (parentSections env0) + } + ) + do + body <- Doc.untitledSection lexemes' . P.lookAhead $ () <$ lit "}}" + closeStart <- posP + lit "}}" + closeEnd <- posP + pure (Token (Doc body) openStart closeEnd, Token Close closeStart closeEnd) + -- Parse any layout tokens after the doc block, e.g. virtual semicolon + endToks <- token' ignore (pure ()) + -- Hack to allow anonymous doc blocks before type decls + -- {{ Some docs }} Foo.doc = {{ Some docs }} + -- ability Foo where => ability Foo where + -- + -- __FIXME__: This should be done _after_ parsing, not in lexing. + tn <- subsequentTypeName + pure $ + beforeStartToks <> case (tn) of + -- If we're followed by a type, we rewrite the doc block to be a named doc block. + Just (WordyId tname) + | isTopLevel -> + Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd + : Token (Open "=") openStart openEnd + : docTok + -- We need an extra 'Close' here because we added an extra Open above. + : closeTok + : endToks + where + isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 + _ -> docTok : endToks + where + -- DUPLICATED + wordyKw kw = separated wordySep (lit kw) + subsequentTypeName = P.lookAhead . P.optional $ do + let lit' s = lit s <* sp + let modifier = typeModifiersAlt (lit' . Text.unpack) + let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) + _ <- optional modifier *> typeOrAbility' *> sp + Token name start stop <- tokenP identifierP + if Name.isSymboly (HQ'.toName name) + then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) + else pure (WordyId name) + ignore _ _ _ = [] + -- DUPLICATED + sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + +lexemes' :: P () -> P [Token Lexeme] +lexemes' eof = + -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one, + -- runs `postLex`, then removes it. + fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) $ + local (\env -> env {inLayout = True, opening = Just "DUMMY"}) do + p <- lexemes $ [] <$ eof + -- deals with a final "unclosed" block at the end of `p`) + unclosed <- takeWhile (("DUMMY" /=) . fst) . layout <$> S.get + let pos = end $ last p + pure $ p <> replicate (length unclosed) (Token Close pos pos) + +-- | Consumes an entire Unison “module”. +lexemes :: P [Token Lexeme] -> P [Token Lexeme] +lexemes eof = + P.optional space >> do + hd <- join <$> P.manyTill toks (P.lookAhead eof) + tl <- eof + pure $ hd <> tl + where + toks :: P [Token Lexeme] + toks = + doc2 + <|> doc + <|> token numeric + <|> token character + <|> reserved + <|> token blank + <|> token identifierLexemeP + <|> (asum . map token) [semi, textual, hash] + + doc :: P [Token Lexeme] + doc = open <+> (CP.space *> fmap fixup body) <+> (close <* space) + where + open = token'' (\t _ _ -> t) $ tok (Open <$> lit "[:") + close = tok (Close <$ lit ":]") + at = lit "@" + -- this removes some trailing whitespace from final textual segment + fixup [] = [] + fixup (Token (Textual (reverse -> txt)) start stop : []) = + [Token (Textual txt') start stop] + where + txt' = reverse (dropWhile (\c -> isSpace c && not (c == '\n')) txt) + fixup (h : t) = h : fixup t + + body :: P [Token Lexeme] + body = txt <+> (atk <|> pure []) + where + ch = (":]" <$ lit "\\:]") <|> ("@" <$ lit "\\@") <|> (pure <$> P.anySingle) + txt = tok (Textual . join <$> P.manyTill ch (P.lookAhead sep)) + sep = void at <|> void close + ref = at *> (tok identifierLexemeP <|> docTyp) + atk = (ref <|> docTyp) <+> body + docTyp = do + _ <- lit "[" + typ <- tok (P.manyTill P.anySingle (P.lookAhead (lit "]"))) + _ <- lit "]" *> CP.space + t <- tok identifierLexemeP + pure $ (fmap Reserved <$> typ) <> t + + blank = + separated wordySep do + _ <- char '_' + seg <- P.optional wordyIdSegP + pure (Blank (maybe "" (Text.unpack . NameSegment.toUnescapedText) seg)) + + semi = char ';' $> Semi False + textual = Textual <$> quoted + quoted = quotedRaw <|> quotedSingleLine + quotedRaw = do + _ <- lit "\"\"\"" + n <- many (char '"') + _ <- optional (char '\n') -- initial newline is skipped + s <- P.manyTill P.anySingle (lit (replicate (length n + 3) '"')) + col0 <- column <$> posP + let col = col0 - (length n) - 3 -- this gets us first col of closing quotes + let leading = replicate (max 0 (col - 1)) ' ' + -- a last line that's equal to `leading` is ignored, since leading + -- spaces up to `col` are not considered part of the string + let tweak l = case reverse l of + last : rest + | col > 1 && last == leading -> reverse rest + | otherwise -> l + [] -> [] + pure $ case tweak (lines s) of + [] -> s + ls + | all (\l -> List.isPrefixOf leading l || all isSpace l) ls -> List.intercalate "\n" (drop (length leading) <$> ls) + | otherwise -> s + quotedSingleLine = char '"' *> P.manyTill (LP.charLiteral <|> sp) (char '"') + where + sp = lit "\\s" $> ' ' + character = Character <$> (char '?' *> (spEsc <|> LP.charLiteral)) + where + spEsc = P.try (char '\\' *> char 's' $> ' ') + + numeric = bytes <|> otherbase <|> float <|> intOrNat + where + intOrNat = P.try $ num <$> sign <*> LP.decimal + float = do + _ <- P.try (P.lookAhead (sign >> (LP.decimal :: P Int) >> (char '.' <|> char 'e' <|> char 'E'))) -- commit after this + start <- posP + sign <- fromMaybe "" <$> sign + base <- P.takeWhile1P (Just "base") isDigit + decimals <- + P.optional $ + let missingFractional = err start (MissingFractional $ base <> ".") + in liftA2 (<>) (lit ".") (P.takeWhile1P (Just "decimals") isDigit <|> missingFractional) + exp <- P.optional $ do + e <- map toLower <$> (lit "e" <|> lit "E") + sign <- fromMaybe "" <$> optional (lit "+" <|> lit "-") + let missingExp = err start (MissingExponent $ base <> fromMaybe "" decimals <> e <> sign) + exp <- P.takeWhile1P (Just "exponent") isDigit <|> missingExp + pure $ e <> sign <> exp + pure $ Numeric (sign <> base <> fromMaybe "" decimals <> fromMaybe "" exp) + + bytes = do + start <- posP + _ <- lit "0xs" + s <- map toLower <$> P.takeWhileP (Just "hexidecimal character") isAlphaNum + case Bytes.fromBase16 $ Bytes.fromWord8s (fromIntegral . ord <$> s) of + Left _ -> err start (InvalidBytesLiteral $ "0xs" <> s) + Right bs -> pure (Bytes bs) + otherbase = octal <|> hex + octal = do + start <- posP + commitAfter2 sign (lit "0o") $ \sign _ -> + fmap (num sign) LP.octal <|> err start InvalidOctalLiteral + hex = do + start <- posP + commitAfter2 sign (lit "0x") $ \sign _ -> + fmap (num sign) LP.hexadecimal <|> err start InvalidHexLiteral + + num :: Maybe String -> Integer -> Lexeme + num sign n = Numeric (fromMaybe "" sign <> show n) + sign = P.optional (lit "+" <|> lit "-") + + hash = Hash <$> P.try shortHashP + + reserved :: P [Token Lexeme] + reserved = + token' (\ts _ _ -> ts) $ + braces + <|> parens + <|> brackets + <|> commaSeparator + <|> delim + <|> delayOrForce + <|> keywords + <|> layoutKeywords + where + keywords = + -- yes "wordy" - just like a wordy keyword like "true", the literal "." (as in the dot in + -- "forall a. a -> a") is considered the keyword "." so long as it is either followed by EOF, a space, or some + -- non-wordy character (because ".foo" is a single identifier lexeme) + wordyKw "." + <|> symbolyKw ":" + <|> openKw "@rewrite" + <|> symbolyKw "@" + <|> symbolyKw "||" + <|> symbolyKw "|" + <|> symbolyKw "&&" + <|> wordyKw "true" + <|> wordyKw "false" + <|> wordyKw "use" + <|> wordyKw "forall" + <|> wordyKw "∀" + <|> wordyKw "termLink" + <|> wordyKw "typeLink" + + wordyKw s = separated wordySep (kw s) + symbolyKw s = separated (not . symbolyIdChar) (kw s) + + kw :: String -> P [Token Lexeme] + kw s = tokenP (lit s) <&> \token -> [Reserved <$> token] + + layoutKeywords :: P [Token Lexeme] + layoutKeywords = + ifElse + <|> withKw + <|> openKw "match" + <|> openKw "handle" + <|> typ + <|> arr + <|> rewriteArr + <|> eq + <|> openKw "cases" + <|> openKw "where" + <|> openKw "let" + <|> openKw "do" + where + ifElse = + openKw "if" + <|> closeKw' (Just "then") ["if"] (lit "then") + <|> closeKw' (Just "else") ["then"] (lit "else") + modKw = typeModifiersAlt (openKw1 wordySep . Text.unpack) + typeOrAbilityKw = typeOrAbilityAlt (openTypeKw1 . Text.unpack) + typ = modKw <|> typeOrAbilityKw + + withKw = do + [Token _ pos1 pos2] <- wordyKw "with" + env <- S.get + let l = layout env + case findClose ["handle", "match"] l of + Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen "'with'") + where + msgOpen = "'handle' or 'match'" + Just (withBlock, n) -> do + let b = withBlock <> "-with" + S.put (env {layout = drop n l, opening = Just b}) + let opens = [Token (Open "with") pos1 pos2] + pure $ replicate n (Token Close pos1 pos2) ++ opens + + -- In `structural/unique type` and `structural/unique ability`, + -- only the `structural` or `unique` opens a layout block, + -- and `ability` and `type` are just keywords. + openTypeKw1 t = do + b <- S.gets (topBlockName . layout) + case b of + Just mod | Set.member (Text.pack mod) typeModifiers -> wordyKw t + _ -> openKw1 wordySep t + + -- layout keyword which bumps the layout column by 1, rather than looking ahead + -- to the next token to determine the layout column + openKw1 :: (Char -> Bool) -> String -> P [Token Lexeme] + openKw1 sep kw = do + Token kw pos0 pos1 <- tokenP $ separated sep (lit kw) + S.modify (\env -> env {layout = (kw, column $ inc pos0) : layout env}) + pure [Token (Open kw) pos0 pos1] + + eq = do + [Token _ start end] <- symbolyKw "=" + env <- S.get + case topBlockName (layout env) of + -- '=' does not open a layout block if within a type declaration + Just t | t == "type" || Set.member (Text.pack t) typeModifiers -> pure [Token (Reserved "=") start end] + Just _ -> S.put (env {opening = Just "="}) >> pure [Token (Open "=") start end] + _ -> err start LayoutError + + rewriteArr = do + [Token _ start end] <- symbolyKw "==>" + env <- S.get + S.put (env {opening = Just "==>"}) >> pure [Token (Open "==>") start end] + + arr = do + [Token _ start end] <- symbolyKw "->" + env <- S.get + -- -> introduces a layout block if we're inside a `match with` or `cases` + case topBlockName (layout env) of + Just match | match `elem` matchWithBlocks -> do + S.put (env {opening = Just "->"}) + pure [Token (Open "->") start end] + _ -> pure [Token (Reserved "->") start end] + + -- a bit of lookahead here to reserve }} for closing a documentation block + braces = open "{" <|> close ["{"] p + where + p = do + l <- lit "}" + -- if we're within an existing {{ }} block, inLayout will be false + -- so we can actually allow }} to appear in normal code + inLayout <- S.gets inLayout + when (not inLayout) $ void $ P.lookAhead (P.satisfy (/= '}')) + pure l + matchWithBlocks = ["match-with", "cases"] + parens = open "(" <|> close ["("] (lit ")") + brackets = open "[" <|> close ["["] (lit "]") + -- `allowCommaToClose` determines if a comma should close inner blocks. + -- Currently there is a set of blocks where `,` is not treated specially + -- and it just emits a Reserved ",". There are currently only three: + -- `cases`, `match-with`, and `{` + allowCommaToClose match = not $ match `elem` ("{" : matchWithBlocks) + commaSeparator = do + env <- S.get + case topBlockName (layout env) of + Just match + | allowCommaToClose match -> + blockDelimiter ["[", "("] (lit ",") + _ -> fail "this comma is a pattern separator" + + delim = P.try $ do + ch <- P.satisfy (\ch -> ch /= ';' && Set.member ch delimiters) + pos <- posP + pure [Token (Reserved [ch]) pos (inc pos)] + + delayOrForce = separated ok $ do + token <- tokenP $ P.satisfy isDelayOrForce + pure [token <&> \op -> Reserved [op]] + where + ok c = isDelayOrForce c || isSpace c || isAlphaNum c || Set.member c delimiters || c == '\"' + +open :: String -> P [Token Lexeme] +open b = do + token <- tokenP $ lit b + env <- S.get + S.put (env {opening = Just b}) + pure [Open b <$ token] + +openKw :: String -> P [Token Lexeme] +openKw s = separated wordySep $ do + token <- tokenP $ lit s + env <- S.get + S.put (env {opening = Just s}) + pure [Open <$> token] + +tok :: P a -> P [Token a] +tok p = do + token <- tokenP p + pure [token] + +-- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is +-- symboly (comprised of only symbols) or wordy (comprised of only alphanums). +-- +-- Examples: +-- +-- foo +-- .foo.++.doc +-- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") +identifierLexemeP :: P Lexeme +identifierLexemeP = identifierLexeme <$> identifierP + +identifierLexeme :: HQ'.HashQualified Name -> Lexeme +identifierLexeme name = + if Name.isSymboly (HQ'.toName name) + then SymbolyId name + else WordyId name + +blockDelimiter :: [String] -> P String -> P [Token Lexeme] +blockDelimiter open closeP = do + Token close pos1 pos2 <- tokenP closeP + env <- S.get + case findClose open (layout env) of + Nothing -> err pos1 (UnexpectedDelimiter (quote close)) + where + quote s = "'" <> s <> "'" + Just (_, n) -> do + S.put (env {layout = drop (n - 1) (layout env)}) + let delims = [Token (Reserved close) pos1 pos2] + pure $ replicate (n - 1) (Token Close pos1 pos2) ++ delims + +close :: [String] -> P String -> P [Token Lexeme] +close = close' Nothing + +closeKw' :: Maybe String -> [String] -> P String -> P [Token Lexeme] +closeKw' reopenBlockname open closeP = close' reopenBlockname open (separated wordySep closeP) + +close' :: Maybe String -> [String] -> P String -> P [Token Lexeme] +close' reopenBlockname open closeP = do + Token close pos1 pos2 <- tokenP closeP + env <- S.get + case findClose open (layout env) of + Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen (quote close)) + where + msgOpen = List.intercalate " or " (quote <$> open) + quote s = "'" <> s <> "'" + Just (_, n) -> do + S.put (env {layout = drop n (layout env), opening = reopenBlockname}) + let opens = maybe [] (const $ [Token (Open close) pos1 pos2]) reopenBlockname + pure $ replicate n (Token Close pos1 pos2) ++ opens + +findClose :: [String] -> Layout -> Maybe (String, Int) +findClose _ [] = Nothing +findClose s ((h, _) : tl) = if h `elem` s then Just (h, 1) else fmap (1 +) <$> findClose s tl + +notLayout :: Token Lexeme -> Bool +notLayout t = case payload t of + Close -> False + Semi _ -> False + Open _ -> False + _ -> True + +top :: Layout -> Column +top [] = 1 +top ((_, h) : _) = h + +topLeftCorner :: Pos +topLeftCorner = Pos 1 1 + +data BlockTree a + = Block + -- | The token that opens the block + a + -- | “Stanzas” of nested tokens + [[BlockTree a]] + -- | The closing token, if any + (Maybe a) + | Leaf a + deriving (Functor, Foldable, Traversable) + +headToken :: BlockTree a -> a +headToken (Block a _ _) = a +headToken (Leaf a) = a + +instance (Show a) => Show (BlockTree a) where + show (Leaf a) = show a + show (Block open mid close) = + show open + ++ "\n" + ++ indent " " (intercalateMap "\n" (intercalateMap " " show) mid) + ++ "\n" + ++ maybe "" show close + where + indent by s = by ++ (s >>= go by) + go by '\n' = '\n' : by + go _ c = [c] + +reorderTree :: ([[BlockTree a]] -> [[BlockTree a]]) -> BlockTree a -> BlockTree a +reorderTree f (Block open mid close) = Block open (f (fmap (reorderTree f) <$> mid)) close +reorderTree _ l = l + +tree :: [Token Lexeme] -> BlockTree (Token Lexeme) +tree toks = one toks const + where + one (open@(payload -> Open _) : ts) k = many (Block open . stanzas) [] ts k + one (t : ts) k = k (Leaf t) ts + one [] k = k lastErr [] + where + lastErr = Leaf case drop (length toks - 1) toks of + [] -> Token (Err LayoutError) topLeftCorner topLeftCorner + (t : _) -> t {payload = Err LayoutError} + + many open acc [] k = k (open (reverse acc) Nothing) [] + many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) $ pure t) ts + many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k + +stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] +stanzas = + toList + . foldr + ( \tok (curr :| stanzas) -> case tok of + Leaf (Token (Semi _) _ _) -> [tok] :| curr : stanzas + _ -> (tok : curr) :| stanzas + ) + ([] :| []) + +-- Moves type and ability declarations to the front of the token stream +-- and move `use` statements to the front of each block +reorder :: [[BlockTree (Token Lexeme)]] -> [[BlockTree (Token Lexeme)]] +reorder = foldr fixup [] . sortWith f + where + f [] = 3 :: Int + f (t0 : _) = case payload $ headToken t0 of + Open mod | Set.member (Text.pack mod) typeModifiers -> 1 + Open typOrA | Set.member (Text.pack typOrA) typeOrAbility -> 1 + Reserved "use" -> 0 + _ -> 3 :: Int + -- after reordering can end up with trailing semicolon at the end of + -- a block, which we remove with this pass + fixup stanza [] = case Lens.unsnoc stanza of + Nothing -> [] + -- remove any trailing `Semi` from the last non-empty stanza + Just (init, Leaf (Token (Semi _) _ _)) -> [init] + -- don’t touch other stanzas + Just (_, _) -> [stanza] + fixup stanza tail = stanza : tail + +-- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. +preParse :: [Token Lexeme] -> BlockTree (Token Lexeme) +preParse = reorderTree reorder . tree + +-- | A few transformations that happen between lexing and parsing. +-- +-- All of these things should move out of the lexer, and be applied in the parse. +postLex :: [Token Lexeme] -> [Token Lexeme] +postLex = toList . preParse . foldr tweak [] + +isDelayOrForce :: Char -> Bool +isDelayOrForce op = op == '\'' || op == '!' + +-- Mapping between characters and their escape codes. Use parse/showEscapeChar to convert. +escapeChars :: [(Char, Char)] +escapeChars = + [ ('0', '\0'), + ('a', '\a'), + ('b', '\b'), + ('f', '\f'), + ('n', '\n'), + ('r', '\r'), + ('t', '\t'), + ('v', '\v'), + ('s', ' '), + ('\'', '\''), + ('"', '"'), + ('\\', '\\') + ] + +-- Inverse of parseEscapeChar; map a character to its escaped version: +showEscapeChar :: Char -> Maybe Char +showEscapeChar c = + Map.lookup c (Map.fromList [(x, y) | (y, x) <- escapeChars]) + +debugFilePreParse :: FilePath -> IO () +debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file + +debugPreParse :: BlockTree (Token Lexeme) -> String +debugPreParse (Leaf (Token (Err (UnexpectedTokens msg)) start end)) = + (if start == end then msg1 else msg2) <> ":\n" <> msg + where + msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start) + msg2 = + "Error on line " + <> show (line start) + <> ", column " + <> show (column start) + <> " - line " + <> show (line end) + <> ", column " + <> show (column end) +debugPreParse ts = show $ payload <$> ts + +debugPreParse' :: String -> String +debugPreParse' = debugPreParse . preParse . lexer "debugPreParse" + +instance P.VisualStream [Token Lexeme] where + showTokens _ xs = + join . Nel.toList . S.evalState (traverse go xs) . end $ Nel.head xs + where + go :: Token Lexeme -> S.State Pos String + go tok = do + prev <- S.get + S.put $ end tok + pure $ pad prev (start tok) ++ pretty (payload tok) + pretty (Open s) = s + pretty (Reserved w) = w + pretty (Textual t) = '"' : t ++ ['"'] + pretty (Character c) = + case showEscapeChar c of + Just c -> "?\\" ++ [c] + Nothing -> '?' : [c] + pretty (WordyId n) = Text.unpack (HQ'.toText n) + pretty (SymbolyId n) = Text.unpack (HQ'.toText n) + pretty (Blank s) = "_" ++ s + pretty (Numeric n) = n + pretty (Hash sh) = show sh + pretty (Err e) = show e + pretty (Bytes bs) = "0xs" <> show bs + pretty Close = "" + pretty (Semi True) = "" + pretty (Semi False) = ";" + pretty (Doc d) = show d + pad (Pos line1 col1) (Pos line2 col2) = + if line1 == line2 + then replicate (col2 - col1) ' ' + else replicate (line2 - line1) '\n' ++ replicate col2 ' ' diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 1bee4d08f4..098caab1b6 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -88,8 +88,9 @@ import Unison.Pattern qualified as Pattern import Unison.Prelude import Unison.Reference (Reference) import Unison.Referent (Referent) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toVar, unsafeParseText) +import Unison.Syntax.Parser.Doc qualified as Doc import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Term (MatchCase (..)) import Unison.UnisonFile.Error qualified as UF @@ -400,7 +401,7 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing -doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection L.DocTree)) +doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree [L.Token L.Lexeme]))) doc = queryToken \case L.Doc d -> pure d _ -> Nothing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs new file mode 100644 index 0000000000..5ca747f204 --- /dev/null +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -0,0 +1,476 @@ +module Unison.Syntax.Parser.Doc where + +import Control.Comonad.Cofree (Cofree ((:<))) +import Control.Monad.State qualified as S +import Data.Char (isControl, isSpace) +import Data.List qualified as List +import Data.List.Extra qualified as List +import Data.List.NonEmpty (NonEmpty ((:|))) +import Data.List.NonEmpty qualified as NonEmpty +import Data.Text qualified as Text +import Text.Megaparsec qualified as P +import Text.Megaparsec.Char (char) +import Text.Megaparsec.Char qualified as CP +import Text.Megaparsec.Char.Lexer qualified as LP +import Unison.Parser.Ann (Ann, Annotated (..)) +import Unison.Prelude +import Unison.Syntax.Lexer + ( P, + ParsingEnv (..), + column, + identifierP, + line, + lit, + local, + sepBy1', + separated, + some', + someTill', + typeOrAbilityAlt, + wordySep, + (<+>), + ) +import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) +import Unison.Syntax.Parser.Doc.Data + +type Tree code = Cofree (Top code) Ann + +-- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that +-- Unison wraps `Doc` literals in `}}`). +untitledSection :: forall code. (Annotated code) => (P () -> P code) -> P () -> P (UntitledSection (Tree code)) +untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.space) + where + wordyKw kw = separated wordySep (lit kw) + sectionElem = section <|> fencedBlock <|> list <|> paragraph + paragraph = wrap' . Paragraph <$> spaced leaf + reserved word = List.isPrefixOf "}}" word || all (== '#') word + + wordy :: P end -> P (Leaf code void) + wordy closing = fmap Word . tokenP . P.try $ do + let end = + P.lookAhead $ + docClose + <|> void (P.satisfy isSpace) + <|> void closing + word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end + guard (not $ reserved word || null word) + pure word + + leafy closing = groupy closing gs + where + gs = + link + <|> externalLink + <|> exampleInline + <|> expr + <|> boldOrItalicOrStrikethrough closing + <|> verbatim + <|> atDoc + <|> wordy closing + + leaf = leafy mzero + + atDoc = src <|> evalInline <|> signature <|> signatureInline + where + comma = lit "," <* CP.space + src = + src' Source "@source" + <|> src' FoldedSource "@foldedSource" + srcElem = + SourceElement + <$> (typeLink <|> termLink) + <*> ( fmap (fromMaybe []) . P.optional $ + (lit "@") *> (CP.space *> annotations) + ) + where + annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space + annotations = + P.some (EmbedAnnotation <$> annotation) + src' name atName = fmap name $ do + _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' srcElem comma + _ <- lit "}" + pure s + signature = fmap Signature $ do + _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' signatureLink comma + _ <- lit "}" + pure s + signatureInline = fmap SignatureInline $ do + _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space + s <- signatureLink + _ <- lit "}" + pure s + evalInline = fmap EvalInline $ do + _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space + let inlineEvalClose = () <$ lit "}" + s <- code inlineEvalClose + pure s + + typeLink = fmap EmbedTypeLink $ do + _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space + tokenP identifierP <* CP.space + + termLink = + fmap EmbedTermLink $ + tokenP identifierP <* CP.space + + signatureLink = + fmap EmbedSignatureLink $ + tokenP identifierP <* CP.space + + groupy closing p = do + Token p _ _ <- tokenP p + after <- P.optional . P.try $ leafy closing + pure $ case after of + Nothing -> p + Just after -> + Group + . Join + $ p + :| pure after + + verbatim = + P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do + Token originalText start stop <- tokenP do + -- a single backtick followed by a non-backtick is treated as monospaced + let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) + -- also two or more ' followed by that number of closing ' + quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) + P.someTill P.anySingle (lit quotes) + let isMultiLine = line start /= line stop + if isMultiLine + then do + let trimmed = (trimAroundDelimiters originalText) + let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' + pure . Verbatim $ + Word $ + Token txt start stop + else + pure . Code $ + Word $ + Token originalText start stop + + exampleInline = + P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ + fmap Example $ do + n <- P.try $ do + _ <- lit "`" + length <$> P.takeWhile1P (Just "backticks") (== '`') + let end = () <$ lit (replicate (n + 1) '`') + ex <- CP.space *> code end + pure ex + + link = + P.label "link (examples: {type List}, {Nat.+})" $ + fmap Link $ + P.try $ + lit "{" *> (typeLink <|> termLink) <* lit "}" + + expr :: P (Leaf code x) + expr = + fmap Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + lit "{{" *> code (() <$ lit "}}") + + nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' + nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace + + -- Allows whitespace or a newline, but not more than two newlines in a row. + whitespaceWithoutParagraphBreak :: P () + whitespaceWithoutParagraphBreak = void do + void nonNewlineSpaces + optional newline >>= \case + Just _ -> void nonNewlineSpaces + Nothing -> pure () + + fencedBlock = + P.label "block eval (syntax: a fenced code block)" $ + evalUnison <|> exampleBlock <|> other + where + evalUnison = fmap (wrap' . Eval) $ do + -- commit after seeing that ``` is on its own line + fence <- P.try $ do + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) + fence <$ guard b + CP.space + *> code (() <$ lit fence) + + exampleBlock = fmap (wrap' . ExampleBlock) $ do + void $ lit "@typecheck" <* CP.space + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + code (() <$ lit fence) + + uncolumn column tabWidth s = + let skip col r | col < 1 = r + skip col s@('\t' : _) | col < tabWidth = s + skip col ('\t' : r) = skip (col - tabWidth) r + skip col (c : r) + | isSpace c && (not $ isControl c) = + skip (col - 1) r + skip _ s = s + in List.intercalate "\n" $ skip column <$> lines s + + other = fmap (uncurry $ wrapSimple2 CodeBlock) $ do + column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel + let tabWidth = toInteger . P.unPos $ P.defaultTabWidth + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + name <- + P.takeWhileP Nothing nonNewlineSpace + *> tokenP (P.takeWhile1P Nothing (not . isSpace)) + <* P.takeWhileP Nothing nonNewlineSpace + _ <- void CP.eol + verbatim <- + tokenP $ + uncolumn column tabWidth . trimAroundDelimiters + <$> P.someTill P.anySingle ([] <$ lit fence) + pure (name, verbatim) + + boldOrItalicOrStrikethrough closing = do + let start = + some (P.satisfy (== '*')) + <|> some (P.satisfy (== '_')) + <|> some + (P.satisfy (== '~')) + name s = + if take 1 s == "~" + then Strikethrough + else if take 1 s == "*" then Bold else Italic + end <- P.try $ do + end <- start + P.lookAhead (P.satisfy (not . isSpace)) + pure end + name end . wrap' . Paragraph + <$> someTill' + (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) + (lit end) + + externalLink = + P.label "hyperlink (example: [link name](https://destination.com))" $ + fmap (uncurry NamedLink) $ do + _ <- lit "[" + p <- leafies (void $ char ']') + _ <- lit "]" + _ <- lit "(" + target <- + fmap (Group . Join) $ + fmap pure link <|> some' (expr <|> wordy (char ')')) + _ <- lit ")" + pure (p, target) + + -- newline = P.optional (lit "\r") *> lit "\n" + + sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + + spaced p = some' (p <* P.optional sp) + leafies close = wrap' . Paragraph <$> spaced (leafy close) + + list = bulletedList <|> numberedList + + bulletedList = wrap' . BulletedList <$> sepBy1' bullet listSep + numberedList = wrap' . NumberedList <$> sepBy1' numberedItem listSep + + listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) + + bulletedStart = P.try $ do + r <- listItemStart' $ [] <$ P.satisfy bulletChar + P.lookAhead (P.satisfy isSpace) + pure r + where + bulletChar ch = ch == '*' || ch == '-' || ch == '+' + + listItemStart' :: P a -> P (Int, a) + listItemStart' gutter = P.try $ do + nonNewlineSpaces + col <- column <$> posP + parentCol <- S.gets parentListColumn + guard (col > parentCol) + (col,) <$> gutter + + numberedStart = + listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") + + listItemParagraph = fmap (wrap' . Paragraph) $ do + col <- column <$> posP + some' (leaf <* sep col) + where + -- Trickiness here to support hard line breaks inside of + -- a bulleted list, so for instance this parses as expected: + -- + -- * uno dos + -- tres quatro + -- * alice bob + -- carol dave eve + sep col = do + _ <- nonNewlineSpaces + _ <- + P.optional . P.try $ + newline + *> nonNewlineSpaces + *> do + col2 <- column <$> posP + guard $ col2 >= col + (P.notFollowedBy $ void numberedStart <|> void bulletedStart) + pure () + + numberedItem = P.label msg $ do + (col, s) <- numberedStart + (s,) + <$> ( fmap (uncurry Column) $ do + p <- nonNewlineSpaces *> listItemParagraph + subList <- + local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) + pure (p, subList) + ) + where + msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" + + bullet = fmap (uncurry Column) . P.label "bullet (examples: * item1, - item2)" $ do + (col, _) <- bulletedStart + p <- nonNewlineSpaces *> listItemParagraph + subList <- + local + (\e -> e {parentListColumn = col}) + (P.optional $ listSep *> list) + pure (p, subList) + + newline = P.label "newline" $ lit "\n" <|> lit "\r\n" + + -- ## Section title + -- + -- A paragraph under this section. + -- Part of the same paragraph. Blanklines separate paragraphs. + -- + -- ### A subsection title + -- + -- A paragraph under this subsection. + + -- # A section title (not a subsection) + section :: P (Tree code) + section = fmap (wrap' . uncurry Section) $ do + ns <- S.gets parentSections + hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp + title <- paragraph <* CP.space + let m = length hashes + head ns + body <- + local (\env -> env {parentSections = (m : (tail ns))}) $ + P.many (sectionElem <* CP.space) + pure $ (title, body) + + wrap' :: Top code (Tree code) -> Tree code + wrap' doc = ann doc :< doc + + wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Top code (Tree code)) -> a -> b -> Tree code + wrapSimple2 fn a b = ann a <> ann b :< fn a b + +-- | If it's a multi-line verbatim block we trim any whitespace representing +-- indentation from the pretty-printer. +-- +-- E.g. +-- +-- @@ +-- {{ +-- # Heading +-- ''' +-- code +-- indented +-- ''' +-- }} +-- @@ +-- +-- Should lex to the text literal "code\n indented". +-- +-- If there's text in the literal that has LESS trailing whitespace than the +-- opening delimiters, we don't trim it at all. E.g. +-- +-- @@ +-- {{ +-- # Heading +-- ''' +-- code +-- ''' +-- }} +-- @@ +-- +-- Is parsed as " code". +-- +-- Trim the expected amount of whitespace from a text literal: +-- >>> trimIndentFromVerbatimBlock 2 " code\n indented" +-- "code\n indented" +-- +-- If the text literal has less leading whitespace than the opening delimiters, +-- leave it as-is +-- >>> trimIndentFromVerbatimBlock 2 "code\n indented" +-- "code\n indented" +trimIndentFromVerbatimBlock :: Int -> String -> String +trimIndentFromVerbatimBlock leadingSpaces txt = fromMaybe txt $ do + List.intercalate "\n" <$> for (lines txt) \line -> do + -- If any 'stripPrefix' fails, we fail and return the unaltered text + case List.stripPrefix (replicate leadingSpaces ' ') line of + Just stripped -> Just stripped + Nothing -> + -- If it was a line with all white-space, just use an empty line, + -- this can happen easily in editors which trim trailing whitespace. + if all isSpace line + then Just "" + else Nothing + +-- Trim leading/trailing whitespace from around delimiters, e.g. +-- +-- {{ +-- '''___ <- whitespace here including newline +-- text block +-- 👇 or here +-- __''' +-- }} +-- >>> trimAroundDelimiters " \n text block \n " +-- " text block " +-- +-- Should leave leading and trailing line untouched if it contains non-whitespace, e.g.: +-- +-- ''' leading whitespace +-- text block +-- trailing whitespace: ''' +-- >>> trimAroundDelimiters " leading whitespace\n text block \ntrailing whitespace: " +-- " leading whitespace\n text block \ntrailing whitespace: " +-- +-- Should keep trailing newline if it's the only thing on the line, e.g.: +-- +-- ''' +-- newline below +-- +-- ''' +-- >>> trimAroundDelimiters "\nnewline below\n\n" +-- "newline below\n\n" +trimAroundDelimiters :: String -> String +trimAroundDelimiters txt = + txt + & ( \s -> + List.breakOn "\n" s + & \case + (prefix, suffix) + | all isSpace prefix -> drop 1 suffix + | otherwise -> prefix <> suffix + ) + & ( \s -> + List.breakOnEnd "\n" s + & \case + (_prefix, "") -> s + (prefix, suffix) + | all isSpace suffix -> dropTrailingNewline prefix + | otherwise -> prefix <> suffix + ) + where + dropTrailingNewline = \case + [] -> [] + (x : xs) -> NonEmpty.init (x NonEmpty.:| xs) diff --git a/unison-syntax/test/Main.hs b/unison-syntax/test/Main.hs index 5c13940b0a..b7235f299b 100644 --- a/unison-syntax/test/Main.hs +++ b/unison-syntax/test/Main.hs @@ -10,7 +10,7 @@ import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as ShortHash import Unison.Syntax.HashQualifiedPrime qualified as HQ' (unsafeParseText) -import Unison.Syntax.Lexer +import Unison.Syntax.Lexer.Unison main :: IO () main = diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 853da4c817..0da37d0036 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -23,9 +23,11 @@ library Unison.Syntax.HashQualifiedPrime Unison.Syntax.Lexer Unison.Syntax.Lexer.Token + Unison.Syntax.Lexer.Unison Unison.Syntax.Name Unison.Syntax.NameSegment Unison.Syntax.Parser + Unison.Syntax.Parser.Doc Unison.Syntax.Parser.Doc.Data Unison.Syntax.ReservedWords Unison.Syntax.ShortHash From e9512a69ce03e137758f8adc81dae04c422260d9 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 12:07:05 -0600 Subject: [PATCH 28/50] Split the Doc parser into multiple functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In general, they map to the constructors of the Doc types, with some wiggle room for now. It’s probably beneficial to review this commit by ignoring whitespace. --- .../src/Unison/Syntax/Lexer/Unison.hs | 2 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 672 ++++++++++-------- 2 files changed, 375 insertions(+), 299 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index dcaf9ca6d3..86e75b1afe 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -313,7 +313,7 @@ doc2 = do } ) do - body <- Doc.untitledSection lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.untitledSection . Doc.sectionElem lexemes' . P.lookAhead $ () <$ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index 5ca747f204..99122bd5ff 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -1,4 +1,44 @@ -module Unison.Syntax.Parser.Doc where +module Unison.Syntax.Parser.Doc + ( Tree, + untitledSection, + sectionElem, + leaf, + + -- * section elements + section, + eval, + exampleBlock, + codeBlock, + list, + bulletedList, + numberedList, + paragraph, + + -- * leaves + link, + namedLink, + example, + transclude, + bold, + italic, + strikethrough, + verbatim, + source, + foldedSource, + evalInline, + signatures, + signatureInline, + group, + word, + + -- * other components + column', + embedTypeLink, + embedTermLink, + embedSignatureLink, + join, + ) +where import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.State qualified as S @@ -13,7 +53,7 @@ import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) -import Unison.Prelude +import Unison.Prelude hiding (join) import Unison.Syntax.Lexer ( P, ParsingEnv (..), @@ -37,146 +77,221 @@ type Tree code = Cofree (Top code) Ann -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -untitledSection :: forall code. (Annotated code) => (P () -> P code) -> P () -> P (UntitledSection (Tree code)) -untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.space) +untitledSection :: P a -> P (UntitledSection a) +untitledSection a = UntitledSection <$> P.many (a <* CP.space) + +wordyKw :: String -> P String +wordyKw kw = separated wordySep (lit kw) + +sectionElem :: (Annotated code) => (P () -> P code) -> P () -> P (Tree code) +sectionElem code docClose = + fmap wrap' $ + section code docClose + <|> P.label "block eval (syntax: a fenced code block)" (eval code <|> exampleBlock code <|> codeBlock) + <|> list code docClose + <|> paragraph code docClose + +paragraph :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +paragraph code = fmap Paragraph . spaced . leafy code + +word :: P end -> P (Leaf code void) +word closing = fmap Word . tokenP . P.try $ do + let end = P.lookAhead $ void (P.satisfy isSpace) <|> void closing + word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end + guard (not $ reserved word || null word) + pure word where - wordyKw kw = separated wordySep (lit kw) - sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap' . Paragraph <$> spaced leaf reserved word = List.isPrefixOf "}}" word || all (== '#') word - wordy :: P end -> P (Leaf code void) - wordy closing = fmap Word . tokenP . P.try $ do - let end = - P.lookAhead $ - docClose - <|> void (P.satisfy isSpace) - <|> void closing - word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end - guard (not $ reserved word || null word) - pure word - - leafy closing = groupy closing gs - where - gs = - link - <|> externalLink - <|> exampleInline - <|> expr - <|> boldOrItalicOrStrikethrough closing - <|> verbatim - <|> atDoc - <|> wordy closing - - leaf = leafy mzero - - atDoc = src <|> evalInline <|> signature <|> signatureInline +leaf :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +leaf code closing = + do + link + <|> namedLink code closing + <|> example code + <|> transclude code + <|> bold code closing + <|> italic code closing + <|> strikethrough code closing + <|> verbatim + <|> source code + <|> foldedSource code + <|> evalInline code + <|> signatures + <|> signatureInline + <|> word closing + +leafy :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +leafy code closing = do + p <- leaf code closing + after <- P.optional . P.try $ leafy code closing + case after of + Nothing -> pure p + Just after -> group . pure $ p :| pure after + +comma :: P String +comma = lit "," <* CP.space + +source :: (P () -> P code) -> P (Leaf code a) +source = fmap Source . (lit "@source" *>) . sourceElements + +foldedSource :: (P () -> P code) -> P (Leaf code a) +foldedSource = fmap FoldedSource . (lit "@foldedSource" *>) . sourceElements + +sourceElements :: (P () -> P code) -> P (NonEmpty (SourceElement (Leaf code Void))) +sourceElements code = do + _ <- (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' srcElem comma + _ <- lit "}" + pure s + where + srcElem = + SourceElement + <$> embedLink + <*> ( fmap (fromMaybe []) . P.optional $ + (lit "@") *> (CP.space *> annotations) + ) where - comma = lit "," <* CP.space - src = - src' Source "@source" - <|> src' FoldedSource "@foldedSource" - srcElem = - SourceElement - <$> (typeLink <|> termLink) - <*> ( fmap (fromMaybe []) . P.optional $ - (lit "@") *> (CP.space *> annotations) - ) - where - annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space - annotations = - P.some (EmbedAnnotation <$> annotation) - src' name atName = fmap name $ do - _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' srcElem comma - _ <- lit "}" - pure s - signature = fmap Signature $ do - _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' signatureLink comma - _ <- lit "}" - pure s - signatureInline = fmap SignatureInline $ do - _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- signatureLink - _ <- lit "}" - pure s - evalInline = fmap EvalInline $ do - _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space - let inlineEvalClose = () <$ lit "}" - s <- code inlineEvalClose - pure s - - typeLink = fmap EmbedTypeLink $ do - _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tokenP identifierP <* CP.space - - termLink = - fmap EmbedTermLink $ - tokenP identifierP <* CP.space - - signatureLink = - fmap EmbedSignatureLink $ - tokenP identifierP <* CP.space - - groupy closing p = do - Token p _ _ <- tokenP p - after <- P.optional . P.try $ leafy closing - pure $ case after of - Nothing -> p - Just after -> - Group - . Join - $ p - :| pure after - - verbatim = - P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do - Token originalText start stop <- tokenP do - -- a single backtick followed by a non-backtick is treated as monospaced - let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) - -- also two or more ' followed by that number of closing ' - quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) - P.someTill P.anySingle (lit quotes) - let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . Verbatim $ - Word $ - Token txt start stop - else - pure . Code $ - Word $ - Token originalText start stop - - exampleInline = - P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - fmap Example $ do - n <- P.try $ do - _ <- lit "`" - length <$> P.takeWhile1P (Just "backticks") (== '`') - let end = () <$ lit (replicate (n + 1) '`') - ex <- CP.space *> code end - pure ex - - link = - P.label "link (examples: {type List}, {Nat.+})" $ - fmap Link $ - P.try $ - lit "{" *> (typeLink <|> termLink) <* lit "}" - - expr :: P (Leaf code x) - expr = - fmap Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - lit "{{" *> code (() <$ lit "}}") - + annotation = fmap Left (tokenP identifierP) <|> fmap Right (transclude code) <* CP.space + annotations = + P.some (EmbedAnnotation <$> annotation) + +signatures :: P (Leaf code a) +signatures = fmap Signature $ do + _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' embedSignatureLink comma + _ <- lit "}" + pure s + +signatureInline :: P (Leaf code a) +signatureInline = fmap SignatureInline $ do + _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space + s <- embedSignatureLink + _ <- lit "}" + pure s + +evalInline :: (P () -> P a1) -> P (Leaf a1 a2) +evalInline code = fmap EvalInline $ do + _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space + let inlineEvalClose = void $ lit "}" + s <- code inlineEvalClose + pure s + +embedTypeLink :: P EmbedLink +embedTypeLink = + EmbedTypeLink <$> do + _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space + tokenP identifierP <* CP.space + +embedTermLink :: P EmbedLink +embedTermLink = EmbedTermLink <$> tokenP identifierP <* CP.space + +embedSignatureLink :: P EmbedSignatureLink +embedSignatureLink = EmbedSignatureLink <$> tokenP identifierP <* CP.space + +verbatim :: P (Leaf code a) +verbatim = + P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do + Token originalText start stop <- tokenP do + -- a single backtick followed by a non-backtick is treated as monospaced + let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) + -- also two or more ' followed by that number of closing ' + quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) + P.someTill P.anySingle (lit quotes) + let isMultiLine = line start /= line stop + if isMultiLine + then do + let trimmed = (trimAroundDelimiters originalText) + let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' + pure . Verbatim $ + Word $ + Token txt start stop + else + pure . Code $ + Word $ + Token originalText start stop + +example :: (P () -> P code) -> P (Leaf code void) +example code = + P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ + fmap Example $ do + n <- P.try $ do + _ <- lit "`" + length <$> P.takeWhile1P (Just "backticks") (== '`') + let end = void . lit $ replicate (n + 1) '`' + CP.space *> code end + +link :: P (Leaf a b) +link = P.label "link (examples: {type List}, {Nat.+})" $ Link <$> P.try (lit "{" *> embedLink <* lit "}") + +transclude :: (P () -> P code) -> P (Leaf code x) +transclude code = + fmap Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + lit "{{" *> code (void $ lit "}}") + +nonNewlineSpaces :: P String +nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace + where nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' - nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace +eval :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +eval code = + Eval <$> do + -- commit after seeing that ``` is on its own line + fence <- P.try $ do + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) + fence <$ guard b + CP.space *> code (void $ lit fence) + +exampleBlock :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +exampleBlock code = + ExampleBlock + <$> do + void $ lit "@typecheck" <* CP.space + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + code . void $ lit fence + +codeBlock :: P (Top code (Tree code)) +codeBlock = do + column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel + let tabWidth = toInteger . P.unPos $ P.defaultTabWidth + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + name <- + nonNewlineSpaces + *> tokenP (P.takeWhile1P Nothing (not . isSpace)) + <* nonNewlineSpaces + _ <- void CP.eol + verbatim <- + tokenP $ + uncolumn column tabWidth . trimAroundDelimiters + <$> P.someTill P.anySingle ([] <$ lit fence) + pure $ CodeBlock name verbatim + where + uncolumn column tabWidth s = + let skip col r | col < 1 = r + skip col s@('\t' : _) | col < tabWidth = s + skip col ('\t' : r) = skip (col - tabWidth) r + skip col (c : r) + | isSpace c && (not $ isControl c) = + skip (col - 1) r + skip _ s = s + in List.intercalate "\n" $ skip column <$> lines s + +emphasis :: (Annotated code) => Char -> (P () -> P code) -> P () -> P (Tree code) +emphasis delimiter code closing = do + let start = some (P.satisfy (== delimiter)) + end <- P.try $ do + end <- start + P.lookAhead (P.satisfy (not . isSpace)) + pure end + wrap' . Paragraph + <$> someTill' + (leafy code (closing <|> (void $ lit end)) <* void whitespaceWithoutParagraphBreak) + (lit end) + where -- Allows whitespace or a newline, but not more than two newlines in a row. whitespaceWithoutParagraphBreak :: P () whitespaceWithoutParagraphBreak = void do @@ -185,124 +300,92 @@ untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.sp Just _ -> void nonNewlineSpaces Nothing -> pure () - fencedBlock = - P.label "block eval (syntax: a fenced code block)" $ - evalUnison <|> exampleBlock <|> other - where - evalUnison = fmap (wrap' . Eval) $ do - -- commit after seeing that ``` is on its own line - fence <- P.try $ do - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) - fence <$ guard b - CP.space - *> code (() <$ lit fence) - - exampleBlock = fmap (wrap' . ExampleBlock) $ do - void $ lit "@typecheck" <* CP.space - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - code (() <$ lit fence) - - uncolumn column tabWidth s = - let skip col r | col < 1 = r - skip col s@('\t' : _) | col < tabWidth = s - skip col ('\t' : r) = skip (col - tabWidth) r - skip col (c : r) - | isSpace c && (not $ isControl c) = - skip (col - 1) r - skip _ s = s - in List.intercalate "\n" $ skip column <$> lines s - - other = fmap (uncurry $ wrapSimple2 CodeBlock) $ do - column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel - let tabWidth = toInteger . P.unPos $ P.defaultTabWidth - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - name <- - P.takeWhileP Nothing nonNewlineSpace - *> tokenP (P.takeWhile1P Nothing (not . isSpace)) - <* P.takeWhileP Nothing nonNewlineSpace - _ <- void CP.eol - verbatim <- - tokenP $ - uncolumn column tabWidth . trimAroundDelimiters - <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name, verbatim) - - boldOrItalicOrStrikethrough closing = do - let start = - some (P.satisfy (== '*')) - <|> some (P.satisfy (== '_')) - <|> some - (P.satisfy (== '~')) - name s = - if take 1 s == "~" - then Strikethrough - else if take 1 s == "*" then Bold else Italic - end <- P.try $ do - end <- start - P.lookAhead (P.satisfy (not . isSpace)) - pure end - name end . wrap' . Paragraph - <$> someTill' - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) - - externalLink = - P.label "hyperlink (example: [link name](https://destination.com))" $ - fmap (uncurry NamedLink) $ do - _ <- lit "[" - p <- leafies (void $ char ']') - _ <- lit "]" - _ <- lit "(" - target <- - fmap (Group . Join) $ - fmap pure link <|> some' (expr <|> wordy (char ')')) - _ <- lit ")" - pure (p, target) - - -- newline = P.optional (lit "\r") *> lit "\n" - - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 +bold :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +bold code = fmap Bold . emphasis '*' code + +italic :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +italic code = fmap Italic . emphasis '_' code + +strikethrough :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +strikethrough code = fmap Strikethrough . emphasis '~' code + +namedLink :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +namedLink code docClose = + P.label "hyperlink (example: [link name](https://destination.com))" do + _ <- lit "[" + p <- spaced . leafy code . void $ char ']' + _ <- lit "]" + _ <- lit "(" + target <- group $ fmap pure link <|> some' (transclude code <|> word (docClose <|> void (char ')'))) + _ <- lit ")" + pure $ NamedLink (wrap' $ Paragraph p) target + +sp :: P String +sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + +spaced :: P a -> P (NonEmpty a) +spaced p = some' (p <* P.optional sp) - spaced p = some' (p <* P.optional sp) - leafies close = wrap' . Paragraph <$> spaced (leafy close) +-- | Not an actual node, but this pattern is referenced in multiple places +list :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +list code docClose = bulletedList code docClose <|> numberedList code docClose - list = bulletedList <|> numberedList +listSep :: P () +listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) - bulletedList = wrap' . BulletedList <$> sepBy1' bullet listSep - numberedList = wrap' . NumberedList <$> sepBy1' numberedItem listSep +bulletedStart :: P (Int, [a]) +bulletedStart = P.try $ do + r <- listItemStart $ [] <$ P.satisfy bulletChar + P.lookAhead (P.satisfy isSpace) + pure r + where + bulletChar ch = ch == '*' || ch == '-' || ch == '+' + +listItemStart :: P a -> P (Int, a) +listItemStart gutter = P.try $ do + nonNewlineSpaces + col <- column <$> posP + parentCol <- S.gets parentListColumn + guard (col > parentCol) + (col,) <$> gutter + +numberedStart :: P (Int, Token Word64) +numberedStart = listItemStart $ P.try (tokenP $ LP.decimal <* lit ".") + +-- | FIXME: This should take a @`P` a@ +numberedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +numberedList code docClose = NumberedList <$> sepBy1' numberedItem listSep + where + numberedItem = P.label "numbered list (examples: 1. item1, 8. start numbering at '8')" do + (col, s) <- numberedStart + (s,) <$> column' code docClose col - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) +-- | FIXME: This should take a @`P` a@ +bulletedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +bulletedList code docClose = BulletedList <$> sepBy1' bullet listSep + where + bullet = P.label "bullet (examples: * item1, - item2)" do + (col, _) <- bulletedStart + column' code docClose col - bulletedStart = P.try $ do - r <- listItemStart' $ [] <$ P.satisfy bulletChar - P.lookAhead (P.satisfy isSpace) - pure r - where - bulletChar ch = ch == '*' || ch == '-' || ch == '+' - - listItemStart' :: P a -> P (Int, a) - listItemStart' gutter = P.try $ do - nonNewlineSpaces - col <- column <$> posP - parentCol <- S.gets parentListColumn - guard (col > parentCol) - (col,) <$> gutter - - numberedStart = - listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - - listItemParagraph = fmap (wrap' . Paragraph) $ do - col <- column <$> posP - some' (leaf <* sep col) +column' :: (Annotated code) => (P () -> P code) -> P () -> Int -> P (Column (Tree code)) +column' code docClose col = + Column . wrap' + <$> (nonNewlineSpaces *> listItemParagraph) + <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list code docClose)) + where + listItemParagraph = + Paragraph <$> do + col <- column <$> posP + some' (leafy code docClose <* sep col) where -- Trickiness here to support hard line breaks inside of -- a bulleted list, so for instance this parses as expected: @@ -323,55 +406,48 @@ untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.sp (P.notFollowedBy $ void numberedStart <|> void bulletedStart) pure () - numberedItem = P.label msg $ do - (col, s) <- numberedStart - (s,) - <$> ( fmap (uncurry Column) $ do - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p, subList) - ) - where - msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" +newline :: P String +newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - bullet = fmap (uncurry Column) . P.label "bullet (examples: * item1, - item2)" $ do - (col, _) <- bulletedStart - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local - (\e -> e {parentListColumn = col}) - (P.optional $ listSep *> list) - pure (p, subList) - - newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - - -- ## Section title - -- - -- A paragraph under this section. - -- Part of the same paragraph. Blanklines separate paragraphs. - -- - -- ### A subsection title - -- - -- A paragraph under this subsection. - - -- # A section title (not a subsection) - section :: P (Tree code) - section = fmap (wrap' . uncurry Section) $ do - ns <- S.gets parentSections - hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp - title <- paragraph <* CP.space - let m = length hashes + head ns - body <- - local (\env -> env {parentSections = (m : (tail ns))}) $ - P.many (sectionElem <* CP.space) - pure $ (title, body) - - wrap' :: Top code (Tree code) -> Tree code - wrap' doc = ann doc :< doc - - wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Top code (Tree code)) -> a -> b -> Tree code - wrapSimple2 fn a b = ann a <> ann b :< fn a b +-- | +-- +-- > ## Section title +-- > +-- > A paragraph under this section. +-- > Part of the same paragraph. Blanklines separate paragraphs. +-- > +-- > ### A subsection title +-- > +-- > A paragraph under this subsection. +-- > +-- > # A section title (not a subsection) +section :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +section code docClose = do + ns <- S.gets parentSections + hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp + title <- paragraph code docClose <* CP.space + let m = length hashes + head ns + body <- + local (\env -> env {parentSections = (m : (tail ns))}) $ + P.many (sectionElem code docClose <* CP.space) + pure $ Section (wrap' title) body + +-- | Not an actual node, but this pattern is referenced in multiple places +embedLink :: P EmbedLink +embedLink = embedTypeLink <|> embedTermLink + +-- | FIXME: This should just take a @`P` code@ and @`P` a@. +group :: P (NonEmpty (Leaf code a)) -> P (Leaf code a) +group = fmap Group . join + +-- | FIXME: This should just take a @`P` a@ +join :: P (NonEmpty a) -> P (Join a) +join = fmap Join + +-- * utility functions + +wrap' :: (Annotated code) => Top code (Tree code) -> Tree code +wrap' doc = ann doc :< doc -- | If it's a multi-line verbatim block we trim any whitespace representing -- indentation from the pretty-printer. @@ -425,7 +501,7 @@ trimIndentFromVerbatimBlock leadingSpaces txt = fromMaybe txt $ do then Just "" else Nothing --- Trim leading/trailing whitespace from around delimiters, e.g. +-- | Trim leading/trailing whitespace from around delimiters, e.g. -- -- {{ -- '''___ <- whitespace here including newline From a6528ac351c1b8f7e51363040d2f710e75a4a1ee Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 22:56:22 -0600 Subject: [PATCH 29/50] Generalize the Doc parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is now completely[^1] independent of the Unison language. The parser takes a few parsers as arguments: one for identifiers, one for code, and one to indicate the end of the Doc block. [^1]: There is one last bit to be removed in the next commit – Doc still looks for `type` or `ability` to identify type links. --- .../src/Unison/Syntax/TermParser.hs | 16 +- unison-syntax/src/Unison/Parser/Ann.hs | 4 + unison-syntax/src/Unison/Syntax/Lexer.hs | 170 +------- .../src/Unison/Syntax/Lexer/Token.hs | 6 +- .../src/Unison/Syntax/Lexer/Unison.hs | 117 +++++- unison-syntax/src/Unison/Syntax/Parser.hs | 2 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 364 +++++++++++------- .../src/Unison/Syntax/Parser/Doc/Data.hs | 72 ++-- 8 files changed, 412 insertions(+), 339 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 4c3069b9ff..8d0195410a 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -530,7 +530,7 @@ doc2Block = do docUntitledSection ann (Doc.UntitledSection tops) = Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops - docTop :: Doc.Top [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docTop :: Doc.Top (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docTop d = case d of Doc.Section title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] Doc.Eval code -> @@ -558,7 +558,7 @@ doc2Block = do docColumn d@(Doc.Column para sublist) = Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist - docLeaf :: Doc.Leaf [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docLeaf :: Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docLeaf d = case d of Doc.Link link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link Doc.NamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) @@ -590,7 +590,7 @@ doc2Block = do Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docEmbedLink :: Doc.EmbedLink -> TermP v m + docEmbedLink :: Doc.EmbedLink (HQ'.HashQualified Name) -> TermP v m docEmbedLink d = case d of Doc.EmbedTypeLink ident -> Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload @@ -598,17 +598,21 @@ doc2Block = do Doc.EmbedTermLink ident -> Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docSourceElement :: Doc.SourceElement (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m + docSourceElement :: + Doc.SourceElement (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + TermP v m docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] - docEmbedSignatureLink :: Doc.EmbedSignatureLink -> TermP v m + docEmbedSignatureLink :: Doc.EmbedSignatureLink (HQ'.HashQualified Name) -> TermP v m docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docEmbedAnnotation :: Doc.EmbedAnnotation (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m + docEmbedAnnotation :: + Doc.EmbedAnnotation (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + TermP v m docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes diff --git a/unison-syntax/src/Unison/Parser/Ann.hs b/unison-syntax/src/Unison/Parser/Ann.hs index 961bbcb30c..e4b361d148 100644 --- a/unison-syntax/src/Unison/Parser/Ann.hs +++ b/unison-syntax/src/Unison/Parser/Ann.hs @@ -4,6 +4,7 @@ module Unison.Parser.Ann where +import Control.Comonad.Cofree (Cofree ((:<))) import Data.List.NonEmpty (NonEmpty) import Data.Void (absurd) import Unison.Lexer.Pos qualified as L @@ -100,3 +101,6 @@ instance (Annotated a) => Annotated (Maybe a) where instance Annotated Void where ann = absurd + +instance (Annotated a) => Annotated (Cofree f a) where + ann (a :< _) = ann a diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index cfd932cd7e..5e6d18293f 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1,12 +1,8 @@ -{-# LANGUAGE TemplateHaskell #-} -{-# OPTIONS_GHC -Wno-orphans #-} - -- | This currently contains a mix of general lexing utilities and identifier-y lexers. module Unison.Syntax.Lexer ( Token (..), Line, Column, - Err (..), Pos (..), touches, @@ -15,16 +11,10 @@ module Unison.Syntax.Lexer wordyIdStartChar, symbolyIdChar, - -- * new exports - BlockName, - Layout, - ParsingEnv (..), - P, + -- * other utils local, - parseFailure, space, lit, - err, commitAfter2, (<+>), some', @@ -32,99 +22,35 @@ module Unison.Syntax.Lexer sepBy1', separated, wordySep, - identifierP, - wordyIdSegP, - shortHashP, - topBlockName, pop, typeOrAbilityAlt, - typeModifiersAlt, inc, ) where -import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.State qualified as S import Data.Char (isSpace) import Data.List.NonEmpty (NonEmpty ((:|))) -import Data.Text qualified as Text import Text.Megaparsec qualified as P import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP -import Text.Megaparsec.Error qualified as EP -import Text.Megaparsec.Internal qualified as PI -import Unison.HashQualifiedPrime qualified as HQ' import Unison.Lexer.Pos (Column, Line, Pos (Pos), column, line) -import Unison.Name (Name) -import Unison.NameSegment (NameSegment) -import Unison.Parser.Ann (Annotated (..)) import Unison.Prelude -import Unison.ShortHash (ShortHash) -import Unison.Syntax.Lexer.Token (Token (..), posP) -import Unison.Syntax.Name qualified as Name (nameP) +import Unison.Syntax.Lexer.Token (Token (..)) import Unison.Syntax.NameSegment (symbolyIdChar, wordyIdChar, wordyIdStartChar) -import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) -import Unison.Syntax.ReservedWords (typeModifiers, typeOrAbility) -import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) - -instance (Annotated a) => Annotated (Cofree f a) where - ann (a :< _) = ann a - -type BlockName = String - -type Layout = [(BlockName, Column)] - -data ParsingEnv = ParsingEnv - { -- | layout stack - layout :: !Layout, - -- | `Just b` if a block of type `b` is being opened - opening :: Maybe BlockName, - -- | are we inside a construct that uses layout? - inLayout :: Bool, - -- | Use a stack to remember the parent section and allow docSections within docSections. - -- - 1 means we are inside a # Heading 1 - parentSections :: [Int], - -- | 4 means we are inside a list starting at the fourth column - parentListColumn :: Int - } - deriving (Show) - -type P = P.ParsecT (Token Err) String (S.State ParsingEnv) - -local :: (ParsingEnv -> ParsingEnv) -> P a -> P a +import Unison.Syntax.ReservedWords (typeOrAbility) + +local :: (P.MonadParsec e s' m, S.MonadState s m) => (s -> s) -> m a -> m a local f p = do env0 <- S.get S.put (f env0) e <- P.observing p S.put env0 case e of - Left e -> parseFailure e + Left e -> P.parseError e Right a -> pure a -parseFailure :: EP.ParseError [Char] (Token Err) -> P a -parseFailure e = PI.ParsecT $ \s _ _ _ eerr -> eerr e s - -data Err - = ReservedWordyId String - | InvalidSymbolyId String - | ReservedSymbolyId String - | InvalidShortHash String - | InvalidBytesLiteral String - | InvalidHexLiteral - | InvalidOctalLiteral - | Both Err Err - | MissingFractional String -- ex `1.` rather than `1.04` - | MissingExponent String -- ex `1e` rather than `1e3` - | UnknownLexeme - | TextLiteralMissingClosingQuote String - | InvalidEscapeCharacter Char - | LayoutError - | CloseWithoutMatchingOpen String String -- open, close - | UnexpectedDelimiter String - | UnexpectedTokens String -- Catch-all for all other lexer errors, representing some unexpected tokens. - deriving stock (Eq, Ord, Show) -- richer algebra - -space :: P () +space :: (P.MonadParsec e String m) => m () space = LP.space CP.space1 @@ -133,92 +59,42 @@ space = where fold = P.try $ lit "---" *> P.takeRest *> pure () -lit :: String -> P String +lit :: (P.MonadParsec e String m) => String -> m String lit = P.try . LP.symbol (pure ()) --- Committed failure -err :: Pos -> Err -> P x -err start t = do - stop <- posP - -- This consumes a character and therefore produces committed failure, - -- so `err s t <|> p2` won't try `p2` - _ <- void P.anySingle <|> P.eof - P.customFailure (Token t start stop) - -{- -commitAfter :: P a -> (a -> P b) -> P b -commitAfter a f = do - a <- P.try a - f a --} - -commitAfter2 :: P a -> P b -> (a -> b -> P c) -> P c +commitAfter2 :: (P.MonadParsec e s m) => m a -> m b -> (a -> b -> m c) -> m c commitAfter2 a b f = do (a, b) <- P.try $ liftA2 (,) a b f a b infixl 2 <+> -(<+>) :: (Monoid a) => P a -> P a -> P a -p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) +(<+>) :: (Applicative f, Monoid a) => f a -> f a -> f a +(<+>) = liftA2 (<>) -- | Like `P.some`, but returns an actual `NonEmpty`. -some' :: P a -> P (NonEmpty a) +some' :: (P.MonadParsec e s m) => m a -> m (NonEmpty a) some' p = liftA2 (:|) p $ many p -- | Like `P.someTill`, but returns an actual `NonEmpty`. -someTill' :: P a -> P end -> P (NonEmpty a) +someTill' :: (P.MonadParsec e s m) => m a -> m end -> m (NonEmpty a) someTill' p end = liftA2 (:|) p $ P.manyTill p end -- | Like `P.sepBy1`, but returns an actual `NonEmpty`. -sepBy1' :: P a -> P sep -> P (NonEmpty a) +sepBy1' :: (P.MonadParsec e s m) => m a -> m sep -> m (NonEmpty a) sepBy1' p sep = liftA2 (:|) p . many $ sep *> p -separated :: (Char -> Bool) -> P a -> P a +separated :: (P.MonadParsec e s m) => (P.Token s -> Bool) -> m a -> m a separated ok p = P.try $ p <* P.lookAhead (void (P.satisfy ok) <|> P.eof) wordySep :: Char -> Bool wordySep c = isSpace c || not (wordyIdChar c) --- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is --- symboly (comprised of only symbols) or wordy (comprised of only alphanums). --- --- Examples: --- --- foo --- .foo.++.doc --- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") -identifierP :: P (HQ'.HashQualified Name) -identifierP = do - P.label "identifier (ex: abba1, snake_case, .foo.bar#xyz, .foo.++#xyz, or 🌻)" do - name <- PI.withParsecT (fmap nameSegmentParseErrToErr) Name.nameP - P.optional shortHashP <&> \case - Nothing -> HQ'.fromName name - Just shorthash -> HQ'.HashQualified name shorthash - where - nameSegmentParseErrToErr :: NameSegment.ParseErr -> Err - nameSegmentParseErrToErr = \case - NameSegment.ReservedOperator s -> ReservedSymbolyId (Text.unpack s) - NameSegment.ReservedWord s -> ReservedWordyId (Text.unpack s) - -wordyIdSegP :: P NameSegment -wordyIdSegP = - PI.withParsecT (fmap (ReservedWordyId . Text.unpack)) NameSegment.wordyP - -shortHashP :: P ShortHash -shortHashP = - PI.withParsecT (fmap (InvalidShortHash . Text.unpack)) ShortHash.shortHashP - -- `True` if the tokens are adjacent, with no space separating the two touches :: Token a -> Token b -> Bool touches (end -> t) (start -> t2) = line t == line t2 && column t == column t2 --- todo: make Layout a NonEmpty -topBlockName :: Layout -> Maybe BlockName -topBlockName [] = Nothing -topBlockName ((name, _) : _) = Just name - pop :: [a] -> [a] pop = drop 1 @@ -226,21 +102,5 @@ typeOrAbilityAlt :: (Alternative f) => (Text -> f a) -> f a typeOrAbilityAlt f = asum $ map f (toList typeOrAbility) -typeModifiersAlt :: (Alternative f) => (Text -> f a) -> f a -typeModifiersAlt f = - asum $ map f (toList typeModifiers) - inc :: Pos -> Pos inc (Pos line col) = Pos line (col + 1) - -instance EP.ShowErrorComponent (Token Err) where - showErrorComponent (Token err _ _) = go err - where - go = \case - UnexpectedTokens msg -> msg - CloseWithoutMatchingOpen open close -> "I found a closing " <> close <> " but no matching " <> open <> "." - Both e1 e2 -> go e1 <> "\n" <> go e2 - LayoutError -> "Indentation error" - TextLiteralMissingClosingQuote s -> "This text literal missing a closing quote: " <> excerpt s - e -> show e - excerpt s = if length s < 15 then s else take 15 s <> "..." diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs index e29f276c5e..f778dd66c0 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs @@ -6,7 +6,7 @@ module Unison.Syntax.Lexer.Token where import Data.Text qualified as Text -import Text.Megaparsec (ParsecT, TraversableStream) +import Text.Megaparsec (MonadParsec, TraversableStream) import Text.Megaparsec qualified as P import Unison.Lexer.Pos (Pos (Pos)) import Unison.Parser.Ann (Ann (Ann), Annotated (..)) @@ -43,14 +43,14 @@ instance Applicative Token where instance P.ShowErrorComponent (Token Text) where showErrorComponent = Text.unpack . payload -tokenP :: (Ord e, TraversableStream s) => ParsecT e s m a -> ParsecT e s m (Token a) +tokenP :: (Ord e, TraversableStream s, MonadParsec e s m) => m a -> m (Token a) tokenP p = do start <- posP payload <- p end <- posP pure Token {payload, start, end} -posP :: (Ord e, TraversableStream s) => ParsecT e s m Pos +posP :: (Ord e, TraversableStream s, MonadParsec e s m) => m Pos posP = do p <- P.getSourcePos pure (Pos (P.unPos (P.sourceLine p)) (P.unPos (P.sourceColumn p))) diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index 86e75b1afe..98112c2124 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -1,6 +1,3 @@ -{-# LANGUAGE TemplateHaskell #-} -{-# OPTIONS_GHC -Wno-orphans #-} - module Unison.Syntax.Lexer.Unison ( Token (..), Line, @@ -44,9 +41,11 @@ import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Text.Megaparsec.Error qualified as EP +import Text.Megaparsec.Internal qualified as PI import Unison.HashQualifiedPrime qualified as HQ' import Unison.Name (Name) import Unison.Name qualified as Name +import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment import Unison.Prelude @@ -55,13 +54,51 @@ import Unison.ShortHash qualified as SH import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) import Unison.Syntax.Lexer import Unison.Syntax.Lexer.Token (posP, tokenP) -import Unison.Syntax.Name qualified as Name (isSymboly, toText, unsafeParseText) +import Unison.Syntax.Name qualified as Name (isSymboly, nameP, toText, unsafeParseText) +import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) import Unison.Syntax.Parser.Doc qualified as Doc import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) +import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) import Unison.Util.Bytes qualified as Bytes import Unison.Util.Monoid (intercalateMap) +type BlockName = String + +type Layout = [(BlockName, Column)] + +data ParsingEnv = ParsingEnv + { -- | layout stack + layout :: !Layout, + -- | `Just b` if a block of type `b` is being opened + opening :: Maybe BlockName, + -- | are we inside a construct that uses layout? + inLayout :: Bool + } + deriving (Show) + +type P = P.ParsecT (Token Err) String (S.State ParsingEnv) + +data Err + = ReservedWordyId String + | InvalidSymbolyId String + | ReservedSymbolyId String + | InvalidShortHash String + | InvalidBytesLiteral String + | InvalidHexLiteral + | InvalidOctalLiteral + | Both Err Err + | MissingFractional String -- ex `1.` rather than `1.04` + | MissingExponent String -- ex `1e` rather than `1e3` + | UnknownLexeme + | TextLiteralMissingClosingQuote String + | InvalidEscapeCharacter Char + | LayoutError + | CloseWithoutMatchingOpen String String -- open, close + | UnexpectedDelimiter String + | UnexpectedTokens String -- Catch-all for all other lexer errors, representing some unexpected tokens. + deriving stock (Eq, Ord, Show) -- richer algebra + -- Design principle: -- `[Lexeme]` should be sufficient information for parsing without -- further knowledge of spacing or indentation levels @@ -80,11 +117,20 @@ data Lexeme | Bytes Bytes.Bytes -- bytes literals | Hash ShortHash -- hash literals | Err Err - | Doc (Doc.UntitledSection (Doc.Tree [Token Lexeme])) + | Doc (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [Token Lexeme])) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? +-- Committed failure +err :: (P.TraversableStream s, P.MonadParsec (Token Err) s m) => Pos -> Err -> m x +err start t = do + stop <- posP + -- This consumes a character and therefore produces committed failure, + -- so `err s t <|> p2` won't try `p2` + _ <- void P.anySingle <|> P.eof + P.customFailure (Token t start stop) + token :: P Lexeme -> P [Token Lexeme] token = token' (\a start end -> [Token a start end]) @@ -230,7 +276,7 @@ lexer scope rem = (P.EndOfInput) -> "end of input" customErrs es = [Err <$> e | P.ErrorCustom e <- toList es] toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col) - env0 = ParsingEnv [] (Just scope) True [0] 0 + env0 = ParsingEnv [] (Just scope) True -- | hacky postprocessing pass to do some cleanup of stuff that's annoying to -- fix without adding more state to the lexer: @@ -306,14 +352,9 @@ doc2 = do env0 <- S.get -- Disable layout while parsing the doc block and reset the section number (docTok, closeTok) <- local - ( \env -> - env - { inLayout = False, - parentSections = 0 : (parentSections env0) - } - ) + (\env -> env {inLayout = False}) do - body <- Doc.untitledSection . Doc.sectionElem lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.doc identifierP lexemes' . P.lookAhead $ () <$ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP @@ -674,6 +715,27 @@ tok p = do token <- tokenP p pure [token] +-- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is +-- symboly (comprised of only symbols) or wordy (comprised of only alphanums). +-- +-- Examples: +-- +-- foo +-- .foo.++.doc +-- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") +identifierP :: (Monad m) => P.ParsecT (Token Err) String m (HQ'.HashQualified Name) +identifierP = do + P.label "identifier (ex: abba1, snake_case, .foo.bar#xyz, .foo.++#xyz, or 🌻)" do + name <- PI.withParsecT (fmap nameSegmentParseErrToErr) Name.nameP + P.optional shortHashP <&> \case + Nothing -> HQ'.fromName name + Just shorthash -> HQ'.HashQualified name shorthash + where + nameSegmentParseErrToErr :: NameSegment.ParseErr -> Err + nameSegmentParseErrToErr = \case + NameSegment.ReservedOperator s -> ReservedSymbolyId (Text.unpack s) + NameSegment.ReservedWord s -> ReservedWordyId (Text.unpack s) + -- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is -- symboly (comprised of only symbols) or wordy (comprised of only alphanums). -- @@ -691,6 +753,14 @@ identifierLexeme name = then SymbolyId name else WordyId name +wordyIdSegP :: P.ParsecT (Token Err) String m NameSegment +wordyIdSegP = + PI.withParsecT (fmap (ReservedWordyId . Text.unpack)) NameSegment.wordyP + +shortHashP :: P.ParsecT (Token Err) String m ShortHash +shortHashP = + PI.withParsecT (fmap (InvalidShortHash . Text.unpack)) ShortHash.shortHashP + blockDelimiter :: [String] -> P String -> P [Token Lexeme] blockDelimiter open closeP = do Token close pos1 pos2 <- tokenP closeP @@ -739,6 +809,11 @@ top :: Layout -> Column top [] = 1 top ((_, h) : _) = h +-- todo: make Layout a NonEmpty +topBlockName :: Layout -> Maybe BlockName +topBlockName [] = Nothing +topBlockName ((name, _) : _) = Just name + topLeftCorner :: Pos topLeftCorner = Pos 1 1 @@ -855,6 +930,10 @@ showEscapeChar :: Char -> Maybe Char showEscapeChar c = Map.lookup c (Map.fromList [(x, y) | (y, x) <- escapeChars]) +typeModifiersAlt :: (Alternative f) => (Text -> f a) -> f a +typeModifiersAlt f = + asum $ map f (toList typeModifiers) + debugFilePreParse :: FilePath -> IO () debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file @@ -877,6 +956,18 @@ debugPreParse ts = show $ payload <$> ts debugPreParse' :: String -> String debugPreParse' = debugPreParse . preParse . lexer "debugPreParse" +instance EP.ShowErrorComponent (Token Err) where + showErrorComponent (Token err _ _) = go err + where + go = \case + UnexpectedTokens msg -> msg + CloseWithoutMatchingOpen open close -> "I found a closing " <> close <> " but no matching " <> open <> "." + Both e1 e2 -> go e1 <> "\n" <> go e2 + LayoutError -> "Indentation error" + TextLiteralMissingClosingQuote s -> "This text literal missing a closing quote: " <> excerpt s + e -> show e + excerpt s = if length s < 15 then s else take 15 s <> "..." + instance P.VisualStream [Token Lexeme] where showTokens _ xs = join . Nel.toList . S.evalState (traverse go xs) . end $ Nel.head xs diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 098caab1b6..fac55142de 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -401,7 +401,7 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing -doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree [L.Token L.Lexeme]))) +doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [L.Token L.Lexeme]))) doc = queryToken \case L.Doc d -> pure d _ -> Nothing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index 99122bd5ff..4009c30dec 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -1,5 +1,17 @@ +-- | The parser for Unison’s @Doc@ syntax. +-- +-- This is completely independent of the Unison language, and requires a couple parsers to be passed in to then +-- provide a parser for @Doc@ applied to any host language. +-- +-- - an identifer parser +-- - a code parser (that accepts a termination parser) +-- - a termination parser (only used for lookahead), for this parser to know when to give up +-- +-- Each of those parsers is expected to satisfy @(`Ord` e, `P.MonadParsec` e `String` m)@. module Unison.Syntax.Parser.Doc ( Tree, + initialState, + doc, untitledSection, sectionElem, leaf, @@ -55,10 +67,7 @@ import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude hiding (join) import Unison.Syntax.Lexer - ( P, - ParsingEnv (..), - column, - identifierP, + ( column, line, lit, local, @@ -73,28 +82,58 @@ import Unison.Syntax.Lexer import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) import Unison.Syntax.Parser.Doc.Data -type Tree code = Cofree (Top code) Ann +type Tree ident code = Cofree (Top ident code) Ann + +data ParsingEnv = ParsingEnv + { -- | Use a stack to remember the parent section and allow docSections within docSections. + -- - 1 means we are inside a # Heading 1 + parentSections :: [Int], + -- | 4 means we are inside a list starting at the fourth column + parentListColumn :: Int + } + deriving (Show) + +initialState :: ParsingEnv +initialState = ParsingEnv [0] 0 + +doc :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (UntitledSection (Tree ident code)) +doc ident code = flip S.evalStateT initialState . untitledSection . sectionElem ident code -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -untitledSection :: P a -> P (UntitledSection a) +untitledSection :: (P.MonadParsec e String m) => m a -> m (UntitledSection a) untitledSection a = UntitledSection <$> P.many (a <* CP.space) -wordyKw :: String -> P String +wordyKw :: (P.MonadParsec e String m) => String -> m String wordyKw kw = separated wordySep (lit kw) -sectionElem :: (Annotated code) => (P () -> P code) -> P () -> P (Tree code) -sectionElem code docClose = +sectionElem :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Tree ident code) +sectionElem ident code docClose = fmap wrap' $ - section code docClose - <|> P.label "block eval (syntax: a fenced code block)" (eval code <|> exampleBlock code <|> codeBlock) - <|> list code docClose - <|> paragraph code docClose - -paragraph :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -paragraph code = fmap Paragraph . spaced . leafy code - -word :: P end -> P (Leaf code void) + section ident code docClose + <|> lift (P.label "block eval (syntax: a fenced code block)" (eval code <|> exampleBlock code <|> codeBlock)) + <|> list ident code docClose + <|> lift (paragraph ident code docClose) + +paragraph :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Top ident code (Tree ident code)) +paragraph ident code = fmap Paragraph . spaced . leafy ident code + +word :: (Ord e, P.MonadParsec e String m) => m end -> m (Leaf ident code void) word closing = fmap Word . tokenP . P.try $ do let end = P.lookAhead $ void (P.satisfy isSpace) <|> void closing word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end @@ -103,43 +142,56 @@ word closing = fmap Word . tokenP . P.try $ do where reserved word = List.isPrefixOf "}}" word || all (== '#') word -leaf :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -leaf code closing = - do - link - <|> namedLink code closing - <|> example code - <|> transclude code - <|> bold code closing - <|> italic code closing - <|> strikethrough code closing +leaf :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +leaf ident code closing = + link ident + <|> namedLink ident code closing + <|> example code + <|> transclude code + <|> bold ident code closing + <|> italic ident code closing + <|> strikethrough ident code closing <|> verbatim - <|> source code - <|> foldedSource code + <|> source ident code + <|> foldedSource ident code <|> evalInline code - <|> signatures - <|> signatureInline + <|> signatures ident + <|> signatureInline ident <|> word closing -leafy :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -leafy code closing = do - p <- leaf code closing - after <- P.optional . P.try $ leafy code closing +leafy :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +leafy ident code closing = do + p <- leaf ident code closing + after <- P.optional . P.try $ leafy ident code closing case after of Nothing -> pure p Just after -> group . pure $ p :| pure after -comma :: P String +comma :: (P.MonadParsec e String m) => m String comma = lit "," <* CP.space -source :: (P () -> P code) -> P (Leaf code a) -source = fmap Source . (lit "@source" *>) . sourceElements +source :: (Ord e, P.MonadParsec e String m) => m ident -> (m () -> m code) -> m (Leaf ident code a) +source ident = fmap Source . (lit "@source" *>) . sourceElements ident -foldedSource :: (P () -> P code) -> P (Leaf code a) -foldedSource = fmap FoldedSource . (lit "@foldedSource" *>) . sourceElements +foldedSource :: (Ord e, P.MonadParsec e String m) => m ident -> (m () -> m code) -> m (Leaf ident code a) +foldedSource ident = fmap FoldedSource . (lit "@foldedSource" *>) . sourceElements ident -sourceElements :: (P () -> P code) -> P (NonEmpty (SourceElement (Leaf code Void))) -sourceElements code = do +sourceElements :: + (Ord e, P.MonadParsec e String m) => + m ident -> + (m () -> m code) -> + m (NonEmpty (SourceElement ident (Leaf ident code Void))) +sourceElements ident code = do _ <- (lit " {" <|> lit "{") *> CP.space s <- sepBy1' srcElem comma _ <- lit "}" @@ -147,49 +199,48 @@ sourceElements code = do where srcElem = SourceElement - <$> embedLink + <$> embedLink ident <*> ( fmap (fromMaybe []) . P.optional $ (lit "@") *> (CP.space *> annotations) ) where - annotation = fmap Left (tokenP identifierP) <|> fmap Right (transclude code) <* CP.space - annotations = - P.some (EmbedAnnotation <$> annotation) + annotation = fmap Left (tokenP ident) <|> fmap Right (transclude code) <* CP.space + annotations = P.some (EmbedAnnotation <$> annotation) -signatures :: P (Leaf code a) -signatures = fmap Signature $ do +signatures :: (Ord e, P.MonadParsec e String m) => m ident -> m (Leaf ident code a) +signatures ident = fmap Signature $ do _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' embedSignatureLink comma + s <- sepBy1' (embedSignatureLink ident) comma _ <- lit "}" pure s -signatureInline :: P (Leaf code a) -signatureInline = fmap SignatureInline $ do +signatureInline :: (Ord e, P.MonadParsec e String m) => m ident -> m (Leaf ident code a) +signatureInline ident = fmap SignatureInline $ do _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- embedSignatureLink + s <- embedSignatureLink ident _ <- lit "}" pure s -evalInline :: (P () -> P a1) -> P (Leaf a1 a2) +evalInline :: (P.MonadParsec e String m) => (m () -> m code) -> m (Leaf ident code a) evalInline code = fmap EvalInline $ do _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space let inlineEvalClose = void $ lit "}" s <- code inlineEvalClose pure s -embedTypeLink :: P EmbedLink -embedTypeLink = +embedTypeLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) +embedTypeLink ident = EmbedTypeLink <$> do _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tokenP identifierP <* CP.space + tokenP ident <* CP.space -embedTermLink :: P EmbedLink -embedTermLink = EmbedTermLink <$> tokenP identifierP <* CP.space +embedTermLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) +embedTermLink ident = EmbedTermLink <$> tokenP ident <* CP.space -embedSignatureLink :: P EmbedSignatureLink -embedSignatureLink = EmbedSignatureLink <$> tokenP identifierP <* CP.space +embedSignatureLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedSignatureLink ident) +embedSignatureLink ident = EmbedSignatureLink <$> tokenP ident <* CP.space -verbatim :: P (Leaf code a) +verbatim :: (Ord e, P.MonadParsec e String m) => m (Leaf ident code a) verbatim = P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do Token originalText start stop <- tokenP do @@ -199,21 +250,17 @@ verbatim = quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) P.someTill P.anySingle (lit quotes) let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . Verbatim $ - Word $ - Token txt start stop - else - pure . Code $ - Word $ - Token originalText start stop - -example :: (P () -> P code) -> P (Leaf code void) + pure + if isMultiLine + then + let trimmed = (trimAroundDelimiters originalText) + txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + in -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' + Verbatim . Word $ Token txt start stop + else Code . Word $ Token originalText start stop + +example :: (P.MonadParsec e String m) => (m () -> m code) -> m (Leaf ident code void) example code = P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ fmap Example $ do @@ -223,20 +270,20 @@ example code = let end = void . lit $ replicate (n + 1) '`' CP.space *> code end -link :: P (Leaf a b) -link = P.label "link (examples: {type List}, {Nat.+})" $ Link <$> P.try (lit "{" *> embedLink <* lit "}") +link :: (Ord e, P.MonadParsec e String m) => m ident -> m (Leaf ident code a) +link ident = P.label "link (examples: {type List}, {Nat.+})" $ Link <$> P.try (lit "{" *> embedLink ident <* lit "}") -transclude :: (P () -> P code) -> P (Leaf code x) +transclude :: (P.MonadParsec e String m) => (m () -> m code) -> m (Leaf ident code a) transclude code = fmap Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ lit "{{" *> code (void $ lit "}}") -nonNewlineSpaces :: P String +nonNewlineSpaces :: (P.MonadParsec e String m) => m String nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace where nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' -eval :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +eval :: (P.MonadParsec e String m, Annotated code) => (m () -> m code) -> m (Top ident code (Tree ident code)) eval code = Eval <$> do -- commit after seeing that ``` is on its own line @@ -246,7 +293,7 @@ eval code = fence <$ guard b CP.space *> code (void $ lit fence) -exampleBlock :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +exampleBlock :: (P.MonadParsec e String m, Annotated code) => (m () -> m code) -> m (Top ident code (Tree ident code)) exampleBlock code = ExampleBlock <$> do @@ -254,7 +301,7 @@ exampleBlock code = fence <- lit "```" <+> P.takeWhileP Nothing (== '`') code . void $ lit fence -codeBlock :: P (Top code (Tree code)) +codeBlock :: (Ord e, P.MonadParsec e String m) => m (Top ident code (Tree ident code)) codeBlock = do column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel let tabWidth = toInteger . P.unPos $ P.defaultTabWidth @@ -280,8 +327,14 @@ codeBlock = do skip _ s = s in List.intercalate "\n" $ skip column <$> lines s -emphasis :: (Annotated code) => Char -> (P () -> P code) -> P () -> P (Tree code) -emphasis delimiter code closing = do +emphasis :: + (Ord e, P.MonadParsec e String m, Annotated code) => + Char -> + m ident -> + (m () -> m code) -> + m () -> + m (Tree ident code) +emphasis delimiter ident code closing = do let start = some (P.satisfy (== delimiter)) end <- P.try $ do end <- start @@ -289,38 +342,57 @@ emphasis delimiter code closing = do pure end wrap' . Paragraph <$> someTill' - (leafy code (closing <|> (void $ lit end)) <* void whitespaceWithoutParagraphBreak) + (leafy ident code (closing <|> (void $ lit end)) <* void whitespaceWithoutParagraphBreak) (lit end) where - -- Allows whitespace or a newline, but not more than two newlines in a row. - whitespaceWithoutParagraphBreak :: P () + -- Allows whitespace including up to one newline whitespaceWithoutParagraphBreak = void do void nonNewlineSpaces optional newline >>= \case Just _ -> void nonNewlineSpaces Nothing -> pure () -bold :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -bold code = fmap Bold . emphasis '*' code - -italic :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -italic code = fmap Italic . emphasis '_' code - -strikethrough :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -strikethrough code = fmap Strikethrough . emphasis '~' code - -namedLink :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -namedLink code docClose = +bold :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +bold ident code = fmap Bold . emphasis '*' ident code + +italic :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +italic ident code = fmap Italic . emphasis '_' ident code + +strikethrough :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +strikethrough ident code = fmap Strikethrough . emphasis '~' ident code + +namedLink :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +namedLink ident code docClose = P.label "hyperlink (example: [link name](https://destination.com))" do _ <- lit "[" - p <- spaced . leafy code . void $ char ']' + p <- spaced . leafy ident code . void $ char ']' _ <- lit "]" _ <- lit "(" - target <- group $ fmap pure link <|> some' (transclude code <|> word (docClose <|> void (char ')'))) + target <- group $ fmap pure (link ident) <|> some' (transclude code <|> word (docClose <|> void (char ')'))) _ <- lit ")" pure $ NamedLink (wrap' $ Paragraph p) target -sp :: P String +sp :: (P.MonadParsec e String m) => m String sp = P.try $ do spaces <- P.takeWhile1P (Just "space") isSpace close <- P.optional (P.lookAhead (lit "}}")) @@ -331,17 +403,22 @@ sp = P.try $ do where ok s = length [() | '\n' <- s] < 2 -spaced :: P a -> P (NonEmpty a) +spaced :: (P.MonadParsec e String m) => m a -> m (NonEmpty a) spaced p = some' (p <* P.optional sp) -- | Not an actual node, but this pattern is referenced in multiple places -list :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -list code docClose = bulletedList code docClose <|> numberedList code docClose - -listSep :: P () +list :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Top ident code (Tree ident code)) +list ident code docClose = bulletedList ident code docClose <|> numberedList ident code docClose + +listSep :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m () listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) -bulletedStart :: P (Int, [a]) +bulletedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, [a]) bulletedStart = P.try $ do r <- listItemStart $ [] <$ P.satisfy bulletChar P.lookAhead (P.satisfy isSpace) @@ -349,43 +426,59 @@ bulletedStart = P.try $ do where bulletChar ch = ch == '*' || ch == '-' || ch == '+' -listItemStart :: P a -> P (Int, a) -listItemStart gutter = P.try $ do +listItemStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m a -> m (Int, a) +listItemStart gutter = P.try do nonNewlineSpaces col <- column <$> posP parentCol <- S.gets parentListColumn guard (col > parentCol) (col,) <$> gutter -numberedStart :: P (Int, Token Word64) +numberedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, Token Word64) numberedStart = listItemStart $ P.try (tokenP $ LP.decimal <* lit ".") -- | FIXME: This should take a @`P` a@ -numberedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -numberedList code docClose = NumberedList <$> sepBy1' numberedItem listSep +numberedList :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Top ident code (Tree ident code)) +numberedList ident code docClose = NumberedList <$> sepBy1' numberedItem listSep where numberedItem = P.label "numbered list (examples: 1. item1, 8. start numbering at '8')" do (col, s) <- numberedStart - (s,) <$> column' code docClose col + (s,) <$> column' ident code docClose col -- | FIXME: This should take a @`P` a@ -bulletedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -bulletedList code docClose = BulletedList <$> sepBy1' bullet listSep +bulletedList :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Top ident code (Tree ident code)) +bulletedList ident code docClose = BulletedList <$> sepBy1' bullet listSep where bullet = P.label "bullet (examples: * item1, - item2)" do (col, _) <- bulletedStart - column' code docClose col - -column' :: (Annotated code) => (P () -> P code) -> P () -> Int -> P (Column (Tree code)) -column' code docClose col = + column' ident code docClose col + +column' :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + Int -> + S.StateT ParsingEnv m (Column (Tree ident code)) +column' ident code docClose col = Column . wrap' <$> (nonNewlineSpaces *> listItemParagraph) - <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list code docClose)) + <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list ident code docClose)) where listItemParagraph = Paragraph <$> do col <- column <$> posP - some' (leafy code docClose <* sep col) + some' (lift (leafy ident code docClose) <* sep col) where -- Trickiness here to support hard line breaks inside of -- a bulleted list, so for instance this parses as expected: @@ -406,7 +499,7 @@ column' code docClose col = (P.notFollowedBy $ void numberedStart <|> void bulletedStart) pure () -newline :: P String +newline :: (P.MonadParsec e String m) => m String newline = P.label "newline" $ lit "\n" <|> lit "\r\n" -- | @@ -421,32 +514,37 @@ newline = P.label "newline" $ lit "\n" <|> lit "\r\n" -- > A paragraph under this subsection. -- > -- > # A section title (not a subsection) -section :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -section code docClose = do +section :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Top ident code (Tree ident code)) +section ident code docClose = do ns <- S.gets parentSections hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp - title <- paragraph code docClose <* CP.space + title <- lift $ paragraph ident code docClose <* CP.space let m = length hashes + head ns body <- local (\env -> env {parentSections = (m : (tail ns))}) $ - P.many (sectionElem code docClose <* CP.space) + P.many (sectionElem ident code docClose <* CP.space) pure $ Section (wrap' title) body -- | Not an actual node, but this pattern is referenced in multiple places -embedLink :: P EmbedLink -embedLink = embedTypeLink <|> embedTermLink +embedLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) +embedLink ident = embedTypeLink ident <|> embedTermLink ident -- | FIXME: This should just take a @`P` code@ and @`P` a@. -group :: P (NonEmpty (Leaf code a)) -> P (Leaf code a) +group :: (P.MonadParsec e s m) => m (NonEmpty (Leaf ident code a)) -> m (Leaf ident code a) group = fmap Group . join -- | FIXME: This should just take a @`P` a@ -join :: P (NonEmpty a) -> P (Join a) +join :: (P.MonadParsec e s m) => m (NonEmpty a) -> m (Join a) join = fmap Join -- * utility functions -wrap' :: (Annotated code) => Top code (Tree code) -> Tree code +wrap' :: (Annotated code) => Top ident code (Tree ident code) -> Tree ident code wrap' doc = ann doc :< doc -- | If it's a multi-line verbatim block we trim any whitespace representing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs index 5167b2bcf6..56a14939b6 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs @@ -1,7 +1,11 @@ {-# LANGUAGE TemplateHaskell #-} +{-# OPTIONS_GHC -Wno-orphans #-} -- | Haskell parallel to @unison/base.Doc@. -- +-- These types have two significant parameters: @ident@ and @code@ that are expected to be parameterized by some +-- representation of identifiers and source code of the host language. +-- -- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The -- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls, -- have it rendered to a scratch file, and then we can’t parse it. Changing the types here to match Unison wouldn’t @@ -13,8 +17,6 @@ import Data.Eq.Deriving (deriveEq1, deriveEq2) import Data.List.NonEmpty (NonEmpty) import Data.Ord.Deriving (deriveOrd1, deriveOrd2) import Text.Show.Deriving (deriveShow1, deriveShow2) -import Unison.HashQualifiedPrime qualified as HQ' -import Unison.Name (Name) import Unison.Parser.Ann (Annotated (..)) import Unison.Prelude import Unison.Syntax.Lexer.Token (Token (..)) @@ -22,7 +24,7 @@ import Unison.Syntax.Lexer.Token (Token (..)) newtype UntitledSection a = UntitledSection [a] deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -data Top code a +data Top ident code a = -- | The first argument is always a `Paragraph` Section a [a] | Eval code @@ -30,7 +32,7 @@ data Top code a | CodeBlock (Token String) (Token String) | BulletedList (NonEmpty (Column a)) | NumberedList (NonEmpty (Token Word64, Column a)) - | Paragraph (NonEmpty (Leaf code a)) + | Paragraph (NonEmpty (Leaf ident code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) data Column a @@ -38,11 +40,11 @@ data Column a Column a (Maybe a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -data Leaf code a - = Link EmbedLink +data Leaf ident code a + = Link (EmbedLink ident) | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of -- `Transclude`s & `Word`s) - NamedLink a (Leaf code Void) + NamedLink a (Leaf ident code Void) | Example code | Transclude code | -- | Always a Paragraph @@ -52,21 +54,21 @@ data Leaf code a | -- | Always a Paragraph Strikethrough a | -- | Always a Word - Verbatim (Leaf Void Void) + Verbatim (Leaf ident Void Void) | -- | Always a Word - Code (Leaf Void Void) + Code (Leaf ident Void Void) | -- | Always a Transclude - Source (NonEmpty (SourceElement (Leaf code Void))) + Source (NonEmpty (SourceElement ident (Leaf ident code Void))) | -- | Always a Transclude - FoldedSource (NonEmpty (SourceElement (Leaf code Void))) + FoldedSource (NonEmpty (SourceElement ident (Leaf ident code Void))) | EvalInline code - | Signature (NonEmpty EmbedSignatureLink) - | SignatureInline EmbedSignatureLink + | Signature (NonEmpty (EmbedSignatureLink ident)) + | SignatureInline (EmbedSignatureLink ident) | Word (Token String) - | Group (Join (Leaf code a)) + | Group (Join (Leaf ident code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -instance Bifunctor Leaf where +instance Bifunctor (Leaf ident) where bimap f g = \case Link x -> Link x NamedLink a leaf -> NamedLink (g a) $ first f leaf @@ -85,25 +87,25 @@ instance Bifunctor Leaf where Word x -> Word x Group join -> Group $ bimap f g <$> join -data EmbedLink - = EmbedTypeLink (Token (HQ'.HashQualified Name)) - | EmbedTermLink (Token (HQ'.HashQualified Name)) +data EmbedLink ident + = EmbedTypeLink (Token ident) + | EmbedTermLink (Token ident) deriving (Eq, Ord, Show) -data SourceElement a = SourceElement EmbedLink [EmbedAnnotation a] +data SourceElement ident a = SourceElement (EmbedLink ident) [EmbedAnnotation ident a] deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -newtype EmbedSignatureLink = EmbedSignatureLink (Token (HQ'.HashQualified Name)) +newtype EmbedSignatureLink ident = EmbedSignatureLink (Token ident) deriving (Eq, Ord, Show) newtype Join a = Join (NonEmpty a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -newtype EmbedAnnotation a - = EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) a) +newtype EmbedAnnotation ident a + = EmbedAnnotation (Either (Token ident) a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -instance (Annotated code, Annotated a) => Annotated (Top code a) where +instance (Annotated code, Annotated a) => Annotated (Top ident code a) where ann = \case Section title body -> ann title <> ann body Eval code -> ann code @@ -116,7 +118,7 @@ instance (Annotated code, Annotated a) => Annotated (Top code a) where instance (Annotated a) => Annotated (Column a) where ann (Column para list) = ann para <> ann list -instance (Annotated code, Annotated a) => Annotated (Leaf code a) where +instance (Annotated code, Annotated a) => Annotated (Leaf ident code a) where ann = \case Link link -> ann link NamedLink label target -> ann label <> ann target @@ -135,31 +137,45 @@ instance (Annotated code, Annotated a) => Annotated (Leaf code a) where Word text -> ann text Group (Join leaves) -> ann leaves -instance Annotated EmbedLink where +instance Annotated (EmbedLink ident) where ann = \case EmbedTypeLink name -> ann name EmbedTermLink name -> ann name -instance (Annotated code) => Annotated (SourceElement code) where +instance (Annotated code) => Annotated (SourceElement ident code) where ann (SourceElement link target) = ann link <> ann target -instance Annotated EmbedSignatureLink where +instance Annotated (EmbedSignatureLink ident) where ann (EmbedSignatureLink name) = ann name -instance (Annotated code) => Annotated (EmbedAnnotation code) where +instance (Annotated code) => Annotated (EmbedAnnotation ident code) where ann (EmbedAnnotation a) = either ann ann a $(deriveEq1 ''Column) $(deriveOrd1 ''Column) $(deriveShow1 ''Column) +$(deriveEq1 ''Token) +$(deriveOrd1 ''Token) +$(deriveShow1 ''Token) + $(deriveEq1 ''EmbedAnnotation) $(deriveOrd1 ''EmbedAnnotation) $(deriveShow1 ''EmbedAnnotation) +$(deriveEq2 ''EmbedAnnotation) +$(deriveOrd2 ''EmbedAnnotation) +$(deriveShow2 ''EmbedAnnotation) + +$(deriveEq1 ''EmbedLink) +$(deriveOrd1 ''EmbedLink) +$(deriveShow1 ''EmbedLink) $(deriveEq1 ''SourceElement) $(deriveOrd1 ''SourceElement) $(deriveShow1 ''SourceElement) +$(deriveEq2 ''SourceElement) +$(deriveOrd2 ''SourceElement) +$(deriveShow2 ''SourceElement) $(deriveEq1 ''Join) $(deriveOrd1 ''Join) From 9a941a389079373465ecdd6b6baad8936f3561b1 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 23:53:49 -0600 Subject: [PATCH 30/50] Caught a hardcoded `}}` in the Doc parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Doc parser shouldn’t know how Unison terminates Doc blocks. --- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index 4009c30dec..cecf2ca6a2 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -131,7 +131,7 @@ paragraph :: (m () -> m code) -> m () -> m (Top ident code (Tree ident code)) -paragraph ident code = fmap Paragraph . spaced . leafy ident code +paragraph ident code docClose = fmap Paragraph . spaced docClose $ leafy ident code docClose word :: (Ord e, P.MonadParsec e String m) => m end -> m (Leaf ident code void) word closing = fmap Word . tokenP . P.try $ do @@ -385,17 +385,17 @@ namedLink :: namedLink ident code docClose = P.label "hyperlink (example: [link name](https://destination.com))" do _ <- lit "[" - p <- spaced . leafy ident code . void $ char ']' + p <- spaced docClose . leafy ident code . void $ char ']' _ <- lit "]" _ <- lit "(" target <- group $ fmap pure (link ident) <|> some' (transclude code <|> word (docClose <|> void (char ')'))) _ <- lit ")" pure $ NamedLink (wrap' $ Paragraph p) target -sp :: (P.MonadParsec e String m) => m String -sp = P.try $ do +sp :: (P.MonadParsec e String m) => m () -> m String +sp docClose = P.try $ do spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) + close <- P.optional (P.lookAhead docClose) case close of Nothing -> guard $ ok spaces Just _ -> pure () @@ -403,8 +403,8 @@ sp = P.try $ do where ok s = length [() | '\n' <- s] < 2 -spaced :: (P.MonadParsec e String m) => m a -> m (NonEmpty a) -spaced p = some' (p <* P.optional sp) +spaced :: (P.MonadParsec e String m) => m () -> m a -> m (NonEmpty a) +spaced docClose p = some' $ p <* P.optional (sp docClose) -- | Not an actual node, but this pattern is referenced in multiple places list :: @@ -522,7 +522,7 @@ section :: S.StateT ParsingEnv m (Top ident code (Tree ident code)) section ident code docClose = do ns <- S.gets parentSections - hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp + hashes <- lift $ P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp docClose title <- lift $ paragraph ident code docClose <* CP.space let m = length hashes + head ns body <- From beecaa9be715f2090b01577e12b2e541a60404b6 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 23:54:54 -0600 Subject: [PATCH 31/50] Make Doc parser ignorant of type/term distinctions This was the last thing tying Doc to Unison. --- parser-typechecker/src/Unison/PrintError.hs | 8 ++++ .../src/Unison/Syntax/TermParser.hs | 43 +++++++++++++------ .../src/Unison/Syntax/Lexer/Unison.hs | 12 ++++-- unison-syntax/src/Unison/Syntax/Parser.hs | 6 ++- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 36 +++------------- .../src/Unison/Syntax/Parser/Doc/Data.hs | 10 ++--- 6 files changed, 59 insertions(+), 56 deletions(-) diff --git a/parser-typechecker/src/Unison/PrintError.hs b/parser-typechecker/src/Unison/PrintError.hs index dd796c0159..691d7cd3ef 100644 --- a/parser-typechecker/src/Unison/PrintError.hs +++ b/parser-typechecker/src/Unison/PrintError.hs @@ -1861,6 +1861,14 @@ renderParseErrors s = \case <> structuralVsUniqueDocsLink ] in (msg, rangeForToken <$> [void keyword, void name]) + go (Parser.TypeNotAllowed tok) = + let msg = + Pr.lines + [ Pr.wrap "I expected to see a term here, but instead it’s a type:", + "", + tokenAsErrorSite s $ HQ.toText <$> tok + ] + in (msg, [rangeForToken tok]) unknownConstructor :: String -> L.Token (HashQualified Name) -> Pretty ColorText diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 8d0195410a..642ed0e339 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -27,6 +27,7 @@ import Data.Text qualified as Text import Data.Tuple.Extra qualified as TupleE import Data.Void (absurd, vacuous) import Text.Megaparsec qualified as P +import U.Codebase.Reference (ReferenceType (..)) import U.Core.ABT qualified as ABT import Unison.ABT qualified as ABT import Unison.Builtin.Decls qualified as DD @@ -530,7 +531,7 @@ doc2Block = do docUntitledSection ann (Doc.UntitledSection tops) = Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops - docTop :: Doc.Top (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docTop :: Doc.Top (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docTop d = case d of Doc.Section title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] Doc.Eval code -> @@ -558,7 +559,7 @@ doc2Block = do docColumn d@(Doc.Column para sublist) = Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist - docLeaf :: Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docLeaf :: Doc.Leaf (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docLeaf d = case d of Doc.Link link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link Doc.NamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) @@ -590,35 +591,49 @@ doc2Block = do Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docEmbedLink :: Doc.EmbedLink (HQ'.HashQualified Name) -> TermP v m - docEmbedLink d = case d of - Doc.EmbedTypeLink ident -> + docEmbedLink :: Doc.EmbedLink (ReferenceType, HQ'.HashQualified Name) -> TermP v m + docEmbedLink d@(Doc.EmbedLink (L.Token (level, ident) start end)) = case level of + RtType -> Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload - <$> findUniqueType (HQ'.toHQ <$> ident) - Doc.EmbedTermLink ident -> - Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + <$> findUniqueType (L.Token (HQ'.toHQ ident) start end) + RtTerm -> + Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (L.Token (HQ'.toHQ ident) start end) docSourceElement :: - Doc.SourceElement (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + Doc.SourceElement + (ReferenceType, HQ'.HashQualified Name) + (Doc.Leaf (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> TermP v m docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] - docEmbedSignatureLink :: Doc.EmbedSignatureLink (HQ'.HashQualified Name) -> TermP v m - docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = - Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + docEmbedSignatureLink :: Doc.EmbedSignatureLink (ReferenceType, HQ'.HashQualified Name) -> TermP v m + docEmbedSignatureLink d@(Doc.EmbedSignatureLink (L.Token (level, ident) start end)) = case level of + RtType -> P.customFailure . TypeNotAllowed $ L.Token (HQ'.toHQ ident) start end + RtTerm -> + Term.app (gann d) (f d "EmbedSignatureLink") . addDelay + <$> resolveHashQualified (L.Token (HQ'.toHQ ident) start end) docEmbedAnnotation :: - Doc.EmbedAnnotation (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + Doc.EmbedAnnotation + (ReferenceType, HQ'.HashQualified Name) + (Doc.Leaf (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> TermP v m docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes -- produced for an identifier and the lexemes consumed for text line up. So, I think this is a bugfix I can’t -- avoid. - Term.app (gann d) (f d "EmbedAnnotation") <$> either (resolveHashQualified . fmap HQ'.toHQ) (docLeaf . vacuous) a + Term.app (gann d) (f d "EmbedAnnotation") + <$> either + ( \(L.Token (level, ident) start end) -> case level of + RtType -> P.customFailure . TypeNotAllowed $ L.Token (HQ'.toHQ ident) start end + RtTerm -> resolveHashQualified $ L.Token (HQ'.toHQ ident) start end + ) + (docLeaf . vacuous) + a docBlock :: (Monad m, Var v) => TermP v m docBlock = do diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index 98112c2124..8a6c20d1a8 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -42,6 +42,7 @@ import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Text.Megaparsec.Error qualified as EP import Text.Megaparsec.Internal qualified as PI +import U.Codebase.Reference (ReferenceType (..)) import Unison.HashQualifiedPrime qualified as HQ' import Unison.Name (Name) import Unison.Name qualified as Name @@ -117,7 +118,7 @@ data Lexeme | Bytes Bytes.Bytes -- bytes literals | Hash ShortHash -- hash literals | Err Err - | Doc (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [Token Lexeme])) + | Doc (Doc.UntitledSection (Doc.Tree (ReferenceType, HQ'.HashQualified Name) [Token Lexeme])) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? @@ -354,7 +355,7 @@ doc2 = do (docTok, closeTok) <- local (\env -> env {inLayout = False}) do - body <- Doc.doc identifierP lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.doc typeOrTerm lexemes' . P.lookAhead $ () <$ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP @@ -382,12 +383,15 @@ doc2 = do isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 _ -> docTok : endToks where - -- DUPLICATED wordyKw kw = separated wordySep (lit kw) + typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) + typeOrTerm = do + mtype <- P.optional $ typeOrAbility' <* CP.space + ident <- identifierP <* CP.space + pure (maybe RtTerm (const RtType) mtype, ident) subsequentTypeName = P.lookAhead . P.optional $ do let lit' s = lit s <* sp let modifier = typeModifiersAlt (lit' . Text.unpack) - let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) _ <- optional modifier *> typeOrAbility' *> sp Token name start stop <- tokenP identifierP if Name.isSymboly (HQ'.toName name) diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index fac55142de..6c4aa74b95 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -72,6 +72,7 @@ import Data.Set qualified as Set import Data.Text qualified as Text import Text.Megaparsec (runParserT) import Text.Megaparsec qualified as P +import U.Codebase.Reference (ReferenceType (..)) import U.Util.Base32Hex qualified as Base32Hex import Unison.ABT qualified as ABT import Unison.ConstructorReference (ConstructorReference) @@ -170,6 +171,8 @@ data Error v | TypeDeclarationErrors [UF.Error v Ann] | -- | MissingTypeModifier (type|ability) name MissingTypeModifier (L.Token String) (L.Token v) + | -- | A type was found in a position that requires a term + TypeNotAllowed (L.Token (HQ.HashQualified Name)) | ResolutionFailures [Names.ResolutionFailure v Ann] | DuplicateTypeNames [(v, [Ann])] | DuplicateTermNames [(v, [Ann])] @@ -401,7 +404,8 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing -doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [L.Token L.Lexeme]))) +doc :: + (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme]))) doc = queryToken \case L.Doc d -> pure d _ -> Nothing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index cecf2ca6a2..8ba6840dd2 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -45,8 +45,7 @@ module Unison.Syntax.Parser.Doc -- * other components column', - embedTypeLink, - embedTermLink, + embedLink, embedSignatureLink, join, ) @@ -59,26 +58,13 @@ import Data.List qualified as List import Data.List.Extra qualified as List import Data.List.NonEmpty (NonEmpty ((:|))) import Data.List.NonEmpty qualified as NonEmpty -import Data.Text qualified as Text import Text.Megaparsec qualified as P import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude hiding (join) -import Unison.Syntax.Lexer - ( column, - line, - lit, - local, - sepBy1', - separated, - some', - someTill', - typeOrAbilityAlt, - wordySep, - (<+>), - ) +import Unison.Syntax.Lexer (column, line, lit, local, sepBy1', some', someTill', (<+>)) import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) import Unison.Syntax.Parser.Doc.Data @@ -109,9 +95,6 @@ doc ident code = flip S.evalStateT initialState . untitledSection . sectionElem untitledSection :: (P.MonadParsec e String m) => m a -> m (UntitledSection a) untitledSection a = UntitledSection <$> P.many (a <* CP.space) -wordyKw :: (P.MonadParsec e String m) => String -> m String -wordyKw kw = separated wordySep (lit kw) - sectionElem :: (Ord e, P.MonadParsec e String m, Annotated code) => m ident -> @@ -228,14 +211,9 @@ evalInline code = fmap EvalInline $ do s <- code inlineEvalClose pure s -embedTypeLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) -embedTypeLink ident = - EmbedTypeLink <$> do - _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tokenP ident <* CP.space - -embedTermLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) -embedTermLink ident = EmbedTermLink <$> tokenP ident <* CP.space +-- | Not an actual node, but this pattern is referenced in multiple places +embedLink :: (Ord e, P.MonadParsec e s m, P.TraversableStream s) => m ident -> m (EmbedLink ident) +embedLink = fmap EmbedLink . tokenP embedSignatureLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedSignatureLink ident) embedSignatureLink ident = EmbedSignatureLink <$> tokenP ident <* CP.space @@ -530,10 +508,6 @@ section ident code docClose = do P.many (sectionElem ident code docClose <* CP.space) pure $ Section (wrap' title) body --- | Not an actual node, but this pattern is referenced in multiple places -embedLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) -embedLink ident = embedTypeLink ident <|> embedTermLink ident - -- | FIXME: This should just take a @`P` code@ and @`P` a@. group :: (P.MonadParsec e s m) => m (NonEmpty (Leaf ident code a)) -> m (Leaf ident code a) group = fmap Group . join diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs index 56a14939b6..75bc3a621e 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs @@ -87,9 +87,9 @@ instance Bifunctor (Leaf ident) where Word x -> Word x Group join -> Group $ bimap f g <$> join -data EmbedLink ident - = EmbedTypeLink (Token ident) - | EmbedTermLink (Token ident) +-- | This is a deviation from the Unison Doc data model – in Unison, Doc distinguishes between type and term links, but +-- here Doc knows nothing about what namespaces may exist. +data EmbedLink ident = EmbedLink (Token ident) deriving (Eq, Ord, Show) data SourceElement ident a = SourceElement (EmbedLink ident) [EmbedAnnotation ident a] @@ -138,9 +138,7 @@ instance (Annotated code, Annotated a) => Annotated (Leaf ident code a) where Group (Join leaves) -> ann leaves instance Annotated (EmbedLink ident) where - ann = \case - EmbedTypeLink name -> ann name - EmbedTermLink name -> ann name + ann (EmbedLink name) = ann name instance (Annotated code) => Annotated (SourceElement ident code) where ann (SourceElement link target) = ann link <> ann target From 94209eae14bc9600d3fcdc382263f1768230d408 Mon Sep 17 00:00:00 2001 From: Travis Staton Date: Mon, 29 Jul 2024 19:45:48 -0400 Subject: [PATCH 32/50] permit empty matches --- .../src/Unison/PatternMatchCoverage.hs | 19 ++++++++++++++----- .../Unison/PatternMatchCoverage/Desugar.hs | 7 ++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/parser-typechecker/src/Unison/PatternMatchCoverage.hs b/parser-typechecker/src/Unison/PatternMatchCoverage.hs index 7a431a486a..62d04167ff 100644 --- a/parser-typechecker/src/Unison/PatternMatchCoverage.hs +++ b/parser-typechecker/src/Unison/PatternMatchCoverage.hs @@ -35,6 +35,7 @@ module Unison.PatternMatchCoverage ) where +import Data.List.NonEmpty (nonEmpty) import Data.Set qualified as Set import Debug.Trace import Unison.Debug @@ -64,13 +65,16 @@ checkMatch :: checkMatch matchLocation scrutineeType cases = do ppe <- getPrettyPrintEnv v0 <- fresh - grdtree0 <- desugarMatch matchLocation scrutineeType v0 cases - doDebug (P.hang (title "desugared:") (prettyGrdTree (prettyPmGrd ppe) (\_ -> "") grdtree0)) (pure ()) - (uncovered, grdtree1) <- uncoverAnnotate (Set.singleton (NC.markDirty v0 $ NC.declVar v0 scrutineeType id NC.emptyNormalizedConstraints)) grdtree0 + mgrdtree0 <- traverse (desugarMatch matchLocation scrutineeType v0) (nonEmpty cases) + doDebug (P.hang (title "desugared:") (prettyGrdTreeMaybe (prettyPmGrd ppe) (\_ -> "") mgrdtree0)) (pure ()) + let initialUncovered = Set.singleton (NC.markDirty v0 $ NC.declVar v0 scrutineeType id NC.emptyNormalizedConstraints) + (uncovered, grdtree1) <- case mgrdtree0 of + Nothing -> pure (initialUncovered, Nothing) + Just grdtree0 -> fmap Just <$> uncoverAnnotate initialUncovered grdtree0 doDebug ( P.sep "\n" - [ P.hang (title "annotated:") (prettyGrdTree (NC.prettyDnf ppe) (NC.prettyDnf ppe . fst) grdtree1), + [ P.hang (title "annotated:") (prettyGrdTreeMaybe (NC.prettyDnf ppe) (NC.prettyDnf ppe . fst) grdtree1), P.hang (title "uncovered:") (NC.prettyDnf ppe uncovered) ] ) @@ -78,9 +82,14 @@ checkMatch matchLocation scrutineeType cases = do uncoveredExpanded <- concat . fmap Set.toList <$> traverse (expandSolution v0) (Set.toList uncovered) doDebug (P.hang (title "uncovered expanded:") (NC.prettyDnf ppe (Set.fromList uncoveredExpanded))) (pure ()) let sols = map (generateInhabitants v0) uncoveredExpanded - let (_accessible, inaccessible, redundant) = classify grdtree1 + let (_accessible, inaccessible, redundant) = case grdtree1 of + Nothing -> ([], [], []) + Just x -> classify x pure (redundant, inaccessible, sols) where + prettyGrdTreeMaybe prettyNode prettyLeaf = \case + Nothing -> "" + Just x -> prettyGrdTree prettyNode prettyLeaf x title = P.bold doDebug out = case shouldDebug PatternCoverage of True -> trace (P.toAnsiUnbroken out) diff --git a/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs b/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs index ce015cc51b..28bf29b754 100644 --- a/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs +++ b/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs @@ -27,12 +27,9 @@ desugarMatch :: -- | scrutinee variable v -> -- | match cases - [MatchCase loc (Term' vt v loc)] -> + NonEmpty (MatchCase loc (Term' vt v loc)) -> m (GrdTree (PmGrd vt v loc) loc) -desugarMatch loc0 scrutineeType v0 cs0 = - traverse desugarClause cs0 >>= \case - [] -> pure $ Leaf loc0 - x : xs -> pure $ Fork (x :| xs) +desugarMatch loc0 scrutineeType v0 cs0 = Fork <$> traverse desugarClause cs0 where desugarClause :: MatchCase loc (Term' vt v loc) -> m (GrdTree (PmGrd vt v loc) loc) desugarClause MatchCase {matchPattern, matchGuard} = From 72da81f18ba7204f4a3d8a2eaad83f218f65d7b6 Mon Sep 17 00:00:00 2001 From: Travis Staton Date: Mon, 29 Jul 2024 21:21:09 -0400 Subject: [PATCH 33/50] remove unused arg --- parser-typechecker/src/Unison/PatternMatchCoverage.hs | 6 ++---- .../src/Unison/PatternMatchCoverage/Desugar.hs | 4 +--- parser-typechecker/src/Unison/Typechecker/Context.hs | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/parser-typechecker/src/Unison/PatternMatchCoverage.hs b/parser-typechecker/src/Unison/PatternMatchCoverage.hs index 62d04167ff..30973b8256 100644 --- a/parser-typechecker/src/Unison/PatternMatchCoverage.hs +++ b/parser-typechecker/src/Unison/PatternMatchCoverage.hs @@ -54,18 +54,16 @@ import Unison.Util.Pretty qualified as P checkMatch :: forall vt v loc m. (Pmc vt v loc m) => - -- | the match location - loc -> -- | scrutinee type Type.Type vt loc -> -- | match cases [Term.MatchCase loc (Term.Term' vt v loc)] -> -- | (redundant locations, inaccessible locations, inhabitants of uncovered refinement type) m ([loc], [loc], [Pattern ()]) -checkMatch matchLocation scrutineeType cases = do +checkMatch scrutineeType cases = do ppe <- getPrettyPrintEnv v0 <- fresh - mgrdtree0 <- traverse (desugarMatch matchLocation scrutineeType v0) (nonEmpty cases) + mgrdtree0 <- traverse (desugarMatch scrutineeType v0) (nonEmpty cases) doDebug (P.hang (title "desugared:") (prettyGrdTreeMaybe (prettyPmGrd ppe) (\_ -> "") mgrdtree0)) (pure ()) let initialUncovered = Set.singleton (NC.markDirty v0 $ NC.declVar v0 scrutineeType id NC.emptyNormalizedConstraints) (uncovered, grdtree1) <- case mgrdtree0 of diff --git a/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs b/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs index 28bf29b754..8587d44d6c 100644 --- a/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs +++ b/parser-typechecker/src/Unison/PatternMatchCoverage/Desugar.hs @@ -20,8 +20,6 @@ import Unison.Type qualified as Type desugarMatch :: forall loc vt v m. (Pmc vt v loc m) => - -- | loc of match - loc -> -- | scrutinee type Type vt loc -> -- | scrutinee variable @@ -29,7 +27,7 @@ desugarMatch :: -- | match cases NonEmpty (MatchCase loc (Term' vt v loc)) -> m (GrdTree (PmGrd vt v loc) loc) -desugarMatch loc0 scrutineeType v0 cs0 = Fork <$> traverse desugarClause cs0 +desugarMatch scrutineeType v0 cs0 = Fork <$> traverse desugarClause cs0 where desugarClause :: MatchCase loc (Term' vt v loc) -> m (GrdTree (PmGrd vt v loc) loc) desugarClause MatchCase {matchPattern, matchGuard} = diff --git a/parser-typechecker/src/Unison/Typechecker/Context.hs b/parser-typechecker/src/Unison/Typechecker/Context.hs index 89eb193212..214fe95a0c 100644 --- a/parser-typechecker/src/Unison/Typechecker/Context.hs +++ b/parser-typechecker/src/Unison/Typechecker/Context.hs @@ -1525,7 +1525,7 @@ ensurePatternCoverage theMatch _theMatchType _scrutinee scrutineeType cases = do constructorCache = mempty } (redundant, _inaccessible, uncovered) <- flip evalStateT pmcState do - checkMatch matchLoc scrutineeType cases + checkMatch scrutineeType cases let checkUncovered = case Nel.nonEmpty uncovered of Nothing -> pure () Just xs -> failWith (UncoveredPatterns matchLoc xs) From 03b225ccd18a3edcc418127ef1f8c0fe98741393 Mon Sep 17 00:00:00 2001 From: Chris Penner Date: Wed, 31 Jul 2024 15:58:28 -0700 Subject: [PATCH 34/50] Add ability to find over EVERY branch. --- .../src/Unison/Codebase/Editor/HandleInput.hs | 124 ++++++++++-------- .../Codebase/Editor/HandleInput/Global.hs | 22 ++++ .../src/Unison/Codebase/Editor/Input.hs | 4 +- .../src/Unison/Codebase/Editor/Output.hs | 3 +- .../src/Unison/CommandLine/InputPatterns.hs | 6 +- unison-cli/unison-cli.cabal | 1 + 6 files changed, 96 insertions(+), 64 deletions(-) create mode 100644 unison-cli/src/Unison/Codebase/Editor/HandleInput/Global.hs diff --git a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs index e85879cc4a..65b2fb781d 100644 --- a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs +++ b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs @@ -63,6 +63,7 @@ import Unison.Codebase.Editor.HandleInput.DeleteProject (handleDeleteProject) import Unison.Codebase.Editor.HandleInput.EditNamespace (handleEditNamespace) import Unison.Codebase.Editor.HandleInput.FindAndReplace (handleStructuredFindI, handleStructuredFindReplaceI) import Unison.Codebase.Editor.HandleInput.FormatFile qualified as Format +import Unison.Codebase.Editor.HandleInput.Global qualified as Global import Unison.Codebase.Editor.HandleInput.InstallLib (handleInstallLib) import Unison.Codebase.Editor.HandleInput.LSPDebug qualified as LSPDebug import Unison.Codebase.Editor.HandleInput.Load (EvalMode (Sandboxed), evalUnisonFile, handleLoad, loadUnisonFile) @@ -1089,7 +1090,7 @@ handleFindI :: Cli () handleFindI isVerbose fscope ws input = do Cli.Env {codebase} <- ask - (pped, names, searchRoot, branch0) <- case fscope of + case fscope of FindLocal p -> do searchRoot <- Cli.resolvePath' p branch0 <- Cli.getBranch0FromProjectPath searchRoot @@ -1097,7 +1098,21 @@ handleFindI isVerbose fscope ws input = do -- Don't exclude anything from the pretty printer, since the type signatures we print for -- results may contain things in lib. pped <- Cli.currentPrettyPrintEnvDecl - pure (pped, names, Just p, branch0) + let suffixifiedPPE = PPED.suffixifiedPPE pped + results <- searchBranch0 codebase branch0 names + if (null results) + then do + Cli.respond FindNoLocalMatches + -- We've already searched everything else, so now we search JUST the + -- names in lib. + let mayOnlyLibBranch = branch0 & Branch.children %%~ \cs -> Map.singleton NameSegment.libSegment <$> Map.lookup NameSegment.libSegment cs + case mayOnlyLibBranch of + Nothing -> respondResults codebase suffixifiedPPE (Just p) [] + Just onlyLibBranch -> do + let onlyLibNames = Branch.toNames onlyLibBranch + results <- searchBranch0 codebase branch0 onlyLibNames + respondResults codebase suffixifiedPPE (Just p) results + else respondResults codebase suffixifiedPPE (Just p) results FindLocalAndDeps p -> do searchRoot <- Cli.resolvePath' p branch0 <- Cli.getBranch0FromProjectPath searchRoot @@ -1105,64 +1120,57 @@ handleFindI isVerbose fscope ws input = do -- Don't exclude anything from the pretty printer, since the type signatures we print for -- results may contain things in lib. pped <- Cli.currentPrettyPrintEnvDecl - pure (pped, names, Just p, branch0) + let suffixifiedPPE = PPED.suffixifiedPPE pped + results <- searchBranch0 codebase branch0 names + respondResults codebase suffixifiedPPE (Just p) results FindGlobal -> do - -- TODO: Rewrite to be properly global again - projectRootNames <- Names.makeAbsolute . Branch.toNames <$> Cli.getCurrentProjectRoot0 - pped <- Cli.prettyPrintEnvDeclFromNames projectRootNames - currentBranch0 <- Cli.getCurrentBranch0 - pure (pped, projectRootNames, Nothing, currentBranch0) - let suffixifiedPPE = PPED.suffixifiedPPE pped - let getResults :: Names -> Cli [SearchResult] - getResults names = - case ws of - [] -> pure (List.sortBy SR.compareByName (SR.fromNames names)) - -- type query - ":" : ws -> do - typ <- parseSearchType (show input) (unwords ws) - let keepNamed = Set.intersection (Branch.deepReferents branch0) - (noExactTypeMatches, matches) <- do - Cli.runTransaction do - matches <- keepNamed <$> Codebase.termsOfType codebase typ - if null matches - then (True,) . keepNamed <$> Codebase.termsMentioningType codebase typ - else pure (False, matches) - when noExactTypeMatches (Cli.respond NoExactTypeMatches) - pure $ - -- in verbose mode, aliases are shown, so we collapse all - -- aliases to a single search result; in non-verbose mode, - -- a separate result may be shown for each alias - (if isVerbose then uniqueBy SR.toReferent else id) $ - searchResultsFor names (Set.toList matches) [] + Global.forAllProjectBranches \(projAndBranchNames, _ids) branch -> do + let branch0 = Branch.head branch + let projectRootNames = Names.makeAbsolute . Branch.toNames $ branch0 + pped <- Cli.prettyPrintEnvDeclFromNames projectRootNames + results <- searchBranch0 codebase branch0 projectRootNames + when (not $ null results) do + Cli.setNumberedArgs $ fmap (SA.SearchResult Nothing) results + results' <- Cli.runTransaction (Backend.loadSearchResults codebase results) + Cli.respond $ GlobalFindBranchResults projAndBranchNames (PPED.suffixifiedPPE pped) isVerbose results' + where + searchBranch0 :: Codebase.Codebase m Symbol Ann -> Branch0 IO -> Names -> Cli [SearchResult] + searchBranch0 codebase branch0 names = + case ws of + [] -> pure (List.sortBy SR.compareByName (SR.fromNames names)) + -- type query + ":" : ws -> do + typ <- parseSearchType (show input) (unwords ws) + let keepNamed = Set.intersection (Branch.deepReferents branch0) + (noExactTypeMatches, matches) <- do + Cli.runTransaction do + matches <- keepNamed <$> Codebase.termsOfType codebase typ + if null matches + then (True,) . keepNamed <$> Codebase.termsMentioningType codebase typ + else pure (False, matches) + when noExactTypeMatches (Cli.respond NoExactTypeMatches) + pure $ + -- in verbose mode, aliases are shown, so we collapse all + -- aliases to a single search result; in non-verbose mode, + -- a separate result may be shown for each alias + (if isVerbose then uniqueBy SR.toReferent else id) $ + searchResultsFor names (Set.toList matches) [] - -- name query - qs -> do - let anythingBeforeHash :: Megaparsec.Parsec (Lexer.Token Text) [Char] Text - anythingBeforeHash = Text.pack <$> Megaparsec.takeWhileP Nothing (/= '#') - let srs = - searchBranchScored - names - Find.simpleFuzzyScore - (mapMaybe (HQ.parseTextWith anythingBeforeHash . Text.pack) qs) - pure $ uniqueBy SR.toReferent srs - let respondResults results = do - Cli.setNumberedArgs $ fmap (SA.SearchResult searchRoot) results - results' <- Cli.runTransaction (Backend.loadSearchResults codebase results) - Cli.respond $ ListOfDefinitions fscope suffixifiedPPE isVerbose results' - results <- getResults names - case (results, fscope) of - ([], FindLocal {}) -> do - Cli.respond FindNoLocalMatches - -- We've already searched everything else, so now we search JUST the - -- names in lib. - let mayOnlyLibBranch = branch0 & Branch.children %%~ \cs -> Map.singleton NameSegment.libSegment <$> Map.lookup NameSegment.libSegment cs - case mayOnlyLibBranch of - Nothing -> respondResults [] - Just onlyLibBranch -> do - let onlyLibNames = Branch.toNames onlyLibBranch - results <- getResults onlyLibNames - respondResults results - _ -> respondResults results + -- name query + qs -> do + let anythingBeforeHash :: Megaparsec.Parsec (Lexer.Token Text) [Char] Text + anythingBeforeHash = Text.pack <$> Megaparsec.takeWhileP Nothing (/= '#') + let srs = + searchBranchScored + names + Find.simpleFuzzyScore + (mapMaybe (HQ.parseTextWith anythingBeforeHash . Text.pack) qs) + pure $ uniqueBy SR.toReferent srs + respondResults :: Codebase.Codebase m Symbol Ann -> PPE.PrettyPrintEnv -> Maybe Path' -> [SearchResult] -> Cli () + respondResults codebase ppe searchRoot results = do + Cli.setNumberedArgs $ fmap (SA.SearchResult searchRoot) results + results' <- Cli.runTransaction (Backend.loadSearchResults codebase results) + Cli.respond $ ListOfDefinitions fscope ppe isVerbose results' handleDependencies :: HQ.HashQualified Name -> Cli () handleDependencies hq = do diff --git a/unison-cli/src/Unison/Codebase/Editor/HandleInput/Global.hs b/unison-cli/src/Unison/Codebase/Editor/HandleInput/Global.hs new file mode 100644 index 0000000000..1306497b61 --- /dev/null +++ b/unison-cli/src/Unison/Codebase/Editor/HandleInput/Global.hs @@ -0,0 +1,22 @@ +module Unison.Codebase.Editor.HandleInput.Global (forAllProjectBranches) where + +import Control.Monad.Reader +import U.Codebase.Sqlite.DbId (ProjectBranchId, ProjectId) +import U.Codebase.Sqlite.Queries qualified as Q +import Unison.Cli.Monad (Cli) +import Unison.Cli.Monad qualified as Cli +import Unison.Codebase qualified as Codebase +import Unison.Codebase.Branch (Branch) +import Unison.Core.Project +import Unison.Prelude +import Unison.Util.Monoid (foldMapM) + +-- | Map over ALL project branches in the codebase. +-- This is a _very_ big hammer, that you should basically never use, except for things like debugging or migrations. +forAllProjectBranches :: (Monoid r) => ((ProjectAndBranch ProjectName ProjectBranchName, ProjectAndBranch ProjectId ProjectBranchId) -> Branch IO -> Cli r) -> Cli r +forAllProjectBranches f = do + Cli.Env {codebase} <- ask + projectBranches <- Cli.runTransaction Q.loadAllProjectBranchNamePairs + projectBranches & foldMapM \(names, ids@(ProjectAndBranch projId branchId)) -> do + b <- liftIO $ Codebase.expectProjectBranchRoot codebase projId branchId + f (names, ids) b diff --git a/unison-cli/src/Unison/Codebase/Editor/Input.hs b/unison-cli/src/Unison/Codebase/Editor/Input.hs index d0bc3ae9d2..e736c618bd 100644 --- a/unison-cli/src/Unison/Codebase/Editor/Input.hs +++ b/unison-cli/src/Unison/Codebase/Editor/Input.hs @@ -127,8 +127,8 @@ data Input | PushRemoteBranchI PushRemoteBranchInput | ResetI (BranchId2 {- namespace to reset it to -}) (Maybe UnresolvedProjectBranch {- ProjectBranch to reset -}) -- todo: Q: Does it make sense to publish to not-the-root of a Github repo? - -- Does it make sense to fork from not-the-root of a Github repo? - | -- used in Welcome module to give directions to user + | -- Does it make sense to fork from not-the-root of a Github repo? + -- used in Welcome module to give directions to user CreateMessage (P.Pretty P.ColorText) | -- Change directory. SwitchBranchI Path' diff --git a/unison-cli/src/Unison/Codebase/Editor/Output.hs b/unison-cli/src/Unison/Codebase/Editor/Output.hs index 6ae0b23616..b8c13900a6 100644 --- a/unison-cli/src/Unison/Codebase/Editor/Output.hs +++ b/unison-cli/src/Unison/Codebase/Editor/Output.hs @@ -261,7 +261,6 @@ data Output | MovedOverExistingBranch Path' | DeletedEverything | ListNames - IsGlobal Int -- hq length to print References [(Reference, [HQ'.HashQualified Name])] -- type match, type names [(Referent, [HQ'.HashQualified Name])] -- term match, term names @@ -269,6 +268,7 @@ data Output | ListOfDefinitions FindScope PPE.PrettyPrintEnv ListDetailed [SearchResult' Symbol Ann] | ListShallow (IO PPE.PrettyPrintEnv) [ShallowListEntry Symbol Ann] | ListStructuredFind [HQ.HashQualified Name] + | GlobalFindBranchResults (ProjectAndBranch ProjectName ProjectBranchName) PPE.PrettyPrintEnv ListDetailed [SearchResult' Symbol Ann] | -- ListStructuredFind patternMatchingUsages termBodyUsages -- show the result of add/update SlurpOutput Input PPE.PrettyPrintEnv SlurpResult @@ -545,6 +545,7 @@ isFailure o = case o of DeletedEverything -> False ListNames _ _ tys tms -> null tms && null tys ListOfDefinitions _ _ _ ds -> null ds + GlobalFindBranchResults _ _ _ _ -> False ListStructuredFind tms -> null tms SlurpOutput _ _ sr -> not $ SR.isOk sr ParseErrors {} -> True diff --git a/unison-cli/src/Unison/CommandLine/InputPatterns.hs b/unison-cli/src/Unison/CommandLine/InputPatterns.hs index 671265a960..f9a9effcb7 100644 --- a/unison-cli/src/Unison/CommandLine/InputPatterns.hs +++ b/unison-cli/src/Unison/CommandLine/InputPatterns.hs @@ -1149,7 +1149,7 @@ findAll :: InputPattern findAll = find' "find.all" (Input.FindLocalAndDeps Path.relativeEmpty') findGlobal :: InputPattern -findGlobal = find' "find.global" Input.FindGlobal +findGlobal = find' "debug.find.global" Input.FindGlobal findIn, findInAll :: InputPattern findIn = findIn' "find-in" Input.FindLocal @@ -1197,8 +1197,8 @@ findHelp = "lists all definitions with a name similar to 'foo' or 'bar' in the " <> "specified subnamespace (including one level of its 'lib')." ), - ( "find.global foo", - "lists all definitions with a name similar to 'foo' in any namespace" + ( "debug.find.global foo", + "Iteratively searches all projects and branches and lists all definitions with a name similar to 'foo'. Note that this is a very slow operation." ) ] ) diff --git a/unison-cli/unison-cli.cabal b/unison-cli/unison-cli.cabal index 77030bfdf6..2bdd255a12 100644 --- a/unison-cli/unison-cli.cabal +++ b/unison-cli/unison-cli.cabal @@ -64,6 +64,7 @@ library Unison.Codebase.Editor.HandleInput.EditNamespace Unison.Codebase.Editor.HandleInput.FindAndReplace Unison.Codebase.Editor.HandleInput.FormatFile + Unison.Codebase.Editor.HandleInput.Global Unison.Codebase.Editor.HandleInput.InstallLib Unison.Codebase.Editor.HandleInput.Load Unison.Codebase.Editor.HandleInput.Ls From d4a04b73492bdfa013108921ef5d8e33e04df163 Mon Sep 17 00:00:00 2001 From: Chris Penner Date: Wed, 31 Jul 2024 15:58:28 -0700 Subject: [PATCH 35/50] re-add names.global --- .../src/Unison/Codebase/Editor/HandleInput.hs | 36 ++++--- .../src/Unison/Codebase/Editor/Output.hs | 8 +- .../src/Unison/CommandLine/InputPatterns.hs | 9 +- .../src/Unison/CommandLine/OutputMessages.hs | 101 ++++++++++-------- 4 files changed, 90 insertions(+), 64 deletions(-) diff --git a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs index 65b2fb781d..ae03247421 100644 --- a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs +++ b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs @@ -498,23 +498,27 @@ loop e = do fixupOutput = HQ'.toHQ . Path.nameFromHQSplit NamesI global query -> do hqLength <- Cli.runTransaction Codebase.hashLength - (names, pped) <- - if global - then do - error "TODO: Implement names.global." - else do - names <- Cli.currentNames + let searchNames names = do pped <- Cli.prettyPrintEnvDeclFromNames names - pure (names, pped) - - let unsuffixifiedPPE = PPED.unsuffixifiedPPE pped - terms = Names.lookupHQTerm Names.IncludeSuffixes query names - types = Names.lookupHQType Names.IncludeSuffixes query names - terms' :: [(Referent, [HQ'.HashQualified Name])] - terms' = map (\r -> (r, PPE.allTermNames unsuffixifiedPPE r)) (Set.toList terms) - types' :: [(Reference, [HQ'.HashQualified Name])] - types' = map (\r -> (r, PPE.allTypeNames unsuffixifiedPPE r)) (Set.toList types) - Cli.respond $ ListNames global hqLength types' terms' + let unsuffixifiedPPE = PPED.unsuffixifiedPPE pped + terms = Names.lookupHQTerm Names.IncludeSuffixes query names + types = Names.lookupHQType Names.IncludeSuffixes query names + terms' :: [(Referent, [HQ'.HashQualified Name])] + terms' = map (\r -> (r, PPE.allTermNames unsuffixifiedPPE r)) (Set.toList terms) + types' :: [(Reference, [HQ'.HashQualified Name])] + types' = map (\r -> (r, PPE.allTypeNames unsuffixifiedPPE r)) (Set.toList types) + pure (terms', types') + if global + then do + Global.forAllProjectBranches \(projBranchNames, _ids) branch -> do + let names = Branch.toNames . Branch.head $ branch + (terms, types) <- searchNames names + when (not (null terms) || not (null types)) do + Cli.respond $ GlobalListNames projBranchNames hqLength types terms + else do + names <- Cli.currentNames + (terms, types) <- searchNames names + Cli.respond $ ListNames hqLength types terms DocsI srcs -> do for_ srcs docsI CreateAuthorI authorNameSegment authorFullName -> do diff --git a/unison-cli/src/Unison/Codebase/Editor/Output.hs b/unison-cli/src/Unison/Codebase/Editor/Output.hs index b8c13900a6..7a59f4ac96 100644 --- a/unison-cli/src/Unison/Codebase/Editor/Output.hs +++ b/unison-cli/src/Unison/Codebase/Editor/Output.hs @@ -264,6 +264,11 @@ data Output Int -- hq length to print References [(Reference, [HQ'.HashQualified Name])] -- type match, type names [(Referent, [HQ'.HashQualified Name])] -- term match, term names + | GlobalListNames + (ProjectAndBranch ProjectName ProjectBranchName) + Int -- hq length to print References + [(Reference, [HQ'.HashQualified Name])] -- type match, type names + [(Referent, [HQ'.HashQualified Name])] -- term match, term names -- list of all the definitions within this branch | ListOfDefinitions FindScope PPE.PrettyPrintEnv ListDetailed [SearchResult' Symbol Ann] | ListShallow (IO PPE.PrettyPrintEnv) [ShallowListEntry Symbol Ann] @@ -543,7 +548,8 @@ isFailure o = case o of MoveRootBranchConfirmation -> False MovedOverExistingBranch {} -> False DeletedEverything -> False - ListNames _ _ tys tms -> null tms && null tys + ListNames _ tys tms -> null tms && null tys + GlobalListNames {} -> False ListOfDefinitions _ _ _ ds -> null ds GlobalFindBranchResults _ _ _ _ -> False ListStructuredFind tms -> null tms diff --git a/unison-cli/src/Unison/CommandLine/InputPatterns.hs b/unison-cli/src/Unison/CommandLine/InputPatterns.hs index f9a9effcb7..38d24809de 100644 --- a/unison-cli/src/Unison/CommandLine/InputPatterns.hs +++ b/unison-cli/src/Unison/CommandLine/InputPatterns.hs @@ -2611,12 +2611,15 @@ names isGlobal = [] I.Visible [("name or hash", Required, definitionQueryArg)] - (P.wrap $ makeExample (names isGlobal) ["foo"] <> " shows the hash and all known names for `foo`.") + (P.wrap $ makeExample (names isGlobal) ["foo"] <> description) $ \case [thing] -> Input.NamesI isGlobal <$> handleHashQualifiedNameArg thing args -> wrongArgsLength "exactly one argument" args where - cmdName = if isGlobal then "names.global" else "names" + description + | isGlobal = "Iteratively search across all projects and branches for names matching `foo`. Note that this is expected to be quite slow and is primarily for debugging issues with your codebase." + | otherwise = "List all known names for `foo` in the current branch." + cmdName = if isGlobal then "debug.names.global" else "names" dependents, dependencies :: InputPattern dependents = @@ -3456,7 +3459,7 @@ validInputs = mergeInputPattern, mergeCommitInputPattern, names False, -- names - names True, -- names.global + names True, -- debug.names.global namespaceDependencies, previewAdd, previewUpdate, diff --git a/unison-cli/src/Unison/CommandLine/OutputMessages.hs b/unison-cli/src/Unison/CommandLine/OutputMessages.hs index d061f37f54..62ed7ea70e 100644 --- a/unison-cli/src/Unison/CommandLine/OutputMessages.hs +++ b/unison-cli/src/Unison/CommandLine/OutputMessages.hs @@ -855,49 +855,24 @@ notifyUser dir = \case ] ListOfDefinitions fscope ppe detailed results -> listOfDefinitions fscope ppe detailed results - ListNames global len types terms -> - if null types && null terms - then - pure . P.callout "😶" $ - P.sepNonEmpty "\n\n" $ - [ P.wrap "I couldn't find anything by that name.", - globalTip - ] - else - pure . P.sepNonEmpty "\n\n" $ - [ formatTypes types, - formatTerms terms, - globalTip - ] - where - globalTip = - if global - then mempty - else (tip $ "Use " <> IP.makeExample (IP.names True) [] <> " to see more results.") - formatTerms tms = - P.lines . P.nonEmpty $ P.plural tms (P.blue "Term") : List.intersperse "" (go <$> tms) - where - go (ref, hqs) = - P.column2 - [ ("Hash:", P.syntaxToColor (prettyReferent len ref)), - ( "Names: ", - P.group $ - P.spaced $ - P.bold . P.syntaxToColor . prettyHashQualified' <$> List.sortBy Name.compareAlphabetical hqs - ) - ] - formatTypes types = - P.lines . P.nonEmpty $ P.plural types (P.blue "Type") : List.intersperse "" (go <$> types) - where - go (ref, hqs) = - P.column2 - [ ("Hash:", P.syntaxToColor (prettyReference len ref)), - ( "Names:", - P.group $ - P.spaced $ - P.bold . P.syntaxToColor . prettyHashQualified' <$> List.sortBy Name.compareAlphabetical hqs - ) - ] + GlobalFindBranchResults projBranchName ppe detailed results -> do + output <- listOfDefinitions Input.FindGlobal ppe detailed results + pure $ + P.lines + [ P.wrap $ "Found results in " <> P.text (into @Text projBranchName), + "", + output + ] + ListNames len types terms -> + listOfNames len types terms + GlobalListNames projectBranchName len types terms -> do + output <- listOfNames len types terms + pure $ + P.lines + [ P.wrap $ "Found results in " <> P.text (into @Text projectBranchName), + "", + output + ] -- > names foo -- Terms: -- Hash: #asdflkjasdflkjasdf @@ -997,7 +972,6 @@ notifyUser dir = \case -- defs in the codebase. In some cases it's fine for bindings to -- shadow codebase names, but you don't want it to capture them in -- the decompiled output. - let prettyBindings = P.bracket . P.lines $ P.wrap "The watch expression(s) reference these definitions:" @@ -2816,6 +2790,45 @@ listOfDefinitions :: listOfDefinitions fscope ppe detailed results = pure $ listOfDefinitions' fscope ppe detailed results +listOfNames :: Int -> [(Reference, [HQ'.HashQualified Name])] -> [(Referent, [HQ'.HashQualified Name])] -> IO Pretty +listOfNames len types terms = do + if null types && null terms + then + pure . P.callout "😶" $ + P.sepNonEmpty "\n\n" $ + [ P.wrap "I couldn't find anything by that name." + ] + else + pure . P.sepNonEmpty "\n\n" $ + [ formatTypes types, + formatTerms terms + ] + where + formatTerms tms = + P.lines . P.nonEmpty $ P.plural tms (P.blue "Term") : List.intersperse "" (go <$> tms) + where + go (ref, hqs) = + P.column2 + [ ("Hash:", P.syntaxToColor (prettyReferent len ref)), + ( "Names: ", + P.group $ + P.spaced $ + P.bold . P.syntaxToColor . prettyHashQualified' <$> List.sortBy Name.compareAlphabetical hqs + ) + ] + formatTypes types = + P.lines . P.nonEmpty $ P.plural types (P.blue "Type") : List.intersperse "" (go <$> types) + where + go (ref, hqs) = + P.column2 + [ ("Hash:", P.syntaxToColor (prettyReference len ref)), + ( "Names:", + P.group $ + P.spaced $ + P.bold . P.syntaxToColor . prettyHashQualified' <$> List.sortBy Name.compareAlphabetical hqs + ) + ] + data ShowNumbers = ShowNumbers | HideNumbers -- | `ppe` is just for rendering type signatures From e545e0b1a7c0614aa364602ae0b19f45bf498b0e Mon Sep 17 00:00:00 2001 From: Chris Penner Date: Wed, 31 Jul 2024 15:58:28 -0700 Subject: [PATCH 36/50] Rerun transcripts --- ...ability-order-doesnt-affect-hash.output.md | 2 - unison-src/transcripts/deep-names.output.md | 8 -- .../transcripts/empty-namespaces.output.md | 4 +- unison-src/transcripts/find-command.output.md | 9 +- unison-src/transcripts/help.output.md | 122 ++++++++++-------- unison-src/transcripts/merge.output.md | 2 - unison-src/transcripts/names.output.md | 6 - unison-src/transcripts/suffixes.output.md | 2 - .../transcripts/unique-type-churn.output.md | 6 - .../update-ignores-lib-namespace.output.md | 2 - 10 files changed, 75 insertions(+), 88 deletions(-) diff --git a/unison-src/transcripts/ability-order-doesnt-affect-hash.output.md b/unison-src/transcripts/ability-order-doesnt-affect-hash.output.md index a61dd00459..d897322a99 100644 --- a/unison-src/transcripts/ability-order-doesnt-affect-hash.output.md +++ b/unison-src/transcripts/ability-order-doesnt-affect-hash.output.md @@ -45,7 +45,5 @@ scratch/main> names term1 Term Hash: #8hum58rlih Names: term1 term2 - - Tip: Use `names.global` to see more results. ``` diff --git a/unison-src/transcripts/deep-names.output.md b/unison-src/transcripts/deep-names.output.md index 114133d786..9756abc509 100644 --- a/unison-src/transcripts/deep-names.output.md +++ b/unison-src/transcripts/deep-names.output.md @@ -48,16 +48,12 @@ scratch/app1> names a Term Hash: #gjmq673r1v Names: lib.text_v1.a lib.text_v2.a - - Tip: Use `names.global` to see more results. scratch/app1> names x Term Hash: #nsmc4p1ra4 Names: lib.http_v3.x lib.http_v4.x - - Tip: Use `names.global` to see more results. ``` Our `app2` project includes the `http` library twice as direct dependencies, and once as an indirect dependency via `webutil`. @@ -102,15 +98,11 @@ scratch/app2> names a Term Hash: #gjmq673r1v Names: lib.webutil.lib.text_v1.a - - Tip: Use `names.global` to see more results. scratch/app2> names x Term Hash: #nsmc4p1ra4 Names: lib.http_v1.x lib.http_v2.x - - Tip: Use `names.global` to see more results. ``` diff --git a/unison-src/transcripts/empty-namespaces.output.md b/unison-src/transcripts/empty-namespaces.output.md index 1b598b6dd4..b1b647ecda 100644 --- a/unison-src/transcripts/empty-namespaces.output.md +++ b/unison-src/transcripts/empty-namespaces.output.md @@ -25,7 +25,7 @@ scratch/main> find.verbose No results. Check your spelling, or try using tab completion to supply command arguments. - `find.global` can be used to search outside the current + `debug.find.global` can be used to search outside the current namespace. ``` @@ -42,7 +42,7 @@ scratch/main> find mynamespace No results. Check your spelling, or try using tab completion to supply command arguments. - `find.global` can be used to search outside the current + `debug.find.global` can be used to search outside the current namespace. ``` diff --git a/unison-src/transcripts/find-command.output.md b/unison-src/transcripts/find-command.output.md index 7abbe26f0d..fde54abfd1 100644 --- a/unison-src/transcripts/find-command.output.md +++ b/unison-src/transcripts/find-command.output.md @@ -86,17 +86,14 @@ scratch/main> find baz No results. Check your spelling, or try using tab completion to supply command arguments. - `find.global` can be used to search outside the current + `debug.find.global` can be used to search outside the current namespace. ``` ``` ucm scratch/main> find.global notHere - 😶 - - No results. Check your spelling, or try using tab completion - to supply command arguments. - +⚠️ +I don't know how to find.global. Type `help` or `?` to get help. ``` diff --git a/unison-src/transcripts/help.output.md b/unison-src/transcripts/help.output.md index 248fb6b4fc..13f3c63820 100644 --- a/unison-src/transcripts/help.output.md +++ b/unison-src/transcripts/help.output.md @@ -113,6 +113,50 @@ scratch/main> help debug.file View details about the most recent successfully typechecked file. + debug.find.global + `find` lists all definitions in the + current namespace. + `find foo` lists all definitions with a + name similar to 'foo' in the + current namespace (excluding + those under 'lib'). + `find foo bar` lists all definitions with a + name similar to 'foo' or + 'bar' in the current + namespace (excluding those + under 'lib'). + `find-in namespace` lists all definitions in the + specified subnamespace. + `find-in namespace foo bar` lists all definitions with a + name similar to 'foo' or + 'bar' in the specified + subnamespace. + find.all foo lists all definitions with a + name similar to 'foo' in the + current namespace (including + one level of 'lib'). + `find-in.all namespace` lists all definitions in the + specified subnamespace + (including one level of its + 'lib'). + `find-in.all namespace foo bar` lists all definitions with a + name similar to 'foo' or + 'bar' in the specified + subnamespace (including one + level of its 'lib'). + debug.find.global foo Iteratively searches all + projects and branches and + lists all definitions with a + name similar to 'foo'. Note + that this is a very slow + operation. + + debug.names.global + `debug.names.global foo` Iteratively search across all + projects and branches for names matching `foo`. Note that this + is expected to be quite slow and is primarily for debugging + issues with your codebase. + debug.numberedArgs Dump the contents of the numbered args state. @@ -269,9 +313,12 @@ scratch/main> help 'bar' in the specified subnamespace (including one level of its 'lib'). - find.global foo lists all definitions with a - name similar to 'foo' in any - namespace + debug.find.global foo Iteratively searches all + projects and branches and + lists all definitions with a + name similar to 'foo'. Note + that this is a very slow + operation. find-in `find` lists all definitions in the @@ -304,9 +351,12 @@ scratch/main> help 'bar' in the specified subnamespace (including one level of its 'lib'). - find.global foo lists all definitions with a - name similar to 'foo' in any - namespace + debug.find.global foo Iteratively searches all + projects and branches and + lists all definitions with a + name similar to 'foo'. Note + that this is a very slow + operation. find-in.all `find` lists all definitions in the @@ -339,9 +389,12 @@ scratch/main> help 'bar' in the specified subnamespace (including one level of its 'lib'). - find.global foo lists all definitions with a - name similar to 'foo' in any - namespace + debug.find.global foo Iteratively searches all + projects and branches and + lists all definitions with a + name similar to 'foo'. Note + that this is a very slow + operation. find.all `find` lists all definitions in the @@ -374,48 +427,16 @@ scratch/main> help 'bar' in the specified subnamespace (including one level of its 'lib'). - find.global foo lists all definitions with a - name similar to 'foo' in any - namespace + debug.find.global foo Iteratively searches all + projects and branches and + lists all definitions with a + name similar to 'foo'. Note + that this is a very slow + operation. find.all.verbose `find.all.verbose` searches for definitions like `find.all`, but includes hashes and aliases in the results. - find.global - `find` lists all definitions in the - current namespace. - `find foo` lists all definitions with a - name similar to 'foo' in the - current namespace (excluding - those under 'lib'). - `find foo bar` lists all definitions with a - name similar to 'foo' or - 'bar' in the current - namespace (excluding those - under 'lib'). - `find-in namespace` lists all definitions in the - specified subnamespace. - `find-in namespace foo bar` lists all definitions with a - name similar to 'foo' or - 'bar' in the specified - subnamespace. - find.all foo lists all definitions with a - name similar to 'foo' in the - current namespace (including - one level of 'lib'). - `find-in.all namespace` lists all definitions in the - specified subnamespace - (including one level of its - 'lib'). - `find-in.all namespace foo bar` lists all definitions with a - name similar to 'foo' or - 'bar' in the specified - subnamespace (including one - level of its 'lib'). - find.global foo lists all definitions with a - name similar to 'foo' in any - namespace - find.verbose `find.verbose` searches for definitions like `find`, but includes hashes and aliases in the results. @@ -526,11 +547,8 @@ scratch/main> help `move.type foo bar` renames `foo` to `bar`. names - `names foo` shows the hash and all known names for `foo`. - - names.global - `names.global foo` shows the hash and all known names for - `foo`. + `names foo` List all known names for `foo` in the current + branch. namespace.dependencies List the external dependencies of the specified namespace. diff --git a/unison-src/transcripts/merge.output.md b/unison-src/transcripts/merge.output.md index 77350b1130..7675b0f748 100644 --- a/unison-src/transcripts/merge.output.md +++ b/unison-src/transcripts/merge.output.md @@ -1435,8 +1435,6 @@ project/alice> names A Type Hash: #65mdg7015r Names: A A.inner.X - - Tip: Use `names.global` to see more results. ``` Bob's branch: diff --git a/unison-src/transcripts/names.output.md b/unison-src/transcripts/names.output.md index 27b986afb0..78d1f5c9f1 100644 --- a/unison-src/transcripts/names.output.md +++ b/unison-src/transcripts/names.output.md @@ -59,8 +59,6 @@ scratch/main> names x Hash: #pi25gcdv0o Names: some.otherplace.x - - Tip: Use `names.global` to see more results. -- We can search by hash, and see all aliases of that hash scratch/main> names #gjmq673r1v @@ -68,8 +66,6 @@ scratch/main> names #gjmq673r1v Term Hash: #gjmq673r1v Names: some.otherplace.y some.place.x somewhere.z - - Tip: Use `names.global` to see more results. -- Works with absolute names too scratch/main> names .some.place.x @@ -77,8 +73,6 @@ scratch/main> names .some.place.x Term Hash: #gjmq673r1v Names: some.otherplace.y some.place.x somewhere.z - - Tip: Use `names.global` to see more results. ``` `names.global` searches from the root, and absolutely qualifies results diff --git a/unison-src/transcripts/suffixes.output.md b/unison-src/transcripts/suffixes.output.md index 43aa678efd..a4cd5e3b02 100644 --- a/unison-src/transcripts/suffixes.output.md +++ b/unison-src/transcripts/suffixes.output.md @@ -165,8 +165,6 @@ scratch/main> names distributed.lib.baz.qux Term Hash: #nhup096n2s Names: lib.distributed.lib.baz.qux - - Tip: Use `names.global` to see more results. ``` ## Corner cases diff --git a/unison-src/transcripts/unique-type-churn.output.md b/unison-src/transcripts/unique-type-churn.output.md index ea00586436..661b0b65dd 100644 --- a/unison-src/transcripts/unique-type-churn.output.md +++ b/unison-src/transcripts/unique-type-churn.output.md @@ -60,8 +60,6 @@ scratch/main> names A Term Hash: #uj8oalgadr#0 Names: A.A - - Tip: Use `names.global` to see more results. ``` ``` unison @@ -99,8 +97,6 @@ scratch/main> names A Term Hash: #ufo5tuc7ho#0 Names: A.A - - Tip: Use `names.global` to see more results. ``` ``` unison @@ -140,7 +136,5 @@ scratch/main> names A Term Hash: #uj8oalgadr#0 Names: A.A - - Tip: Use `names.global` to see more results. ``` diff --git a/unison-src/transcripts/update-ignores-lib-namespace.output.md b/unison-src/transcripts/update-ignores-lib-namespace.output.md index dc03596d08..a91ca27840 100644 --- a/unison-src/transcripts/update-ignores-lib-namespace.output.md +++ b/unison-src/transcripts/update-ignores-lib-namespace.output.md @@ -62,7 +62,5 @@ scratch/main> names foo Term Hash: #9ntnotdp87 Names: foo - - Tip: Use `names.global` to see more results. ``` From f9db384df181ebfee974d9d2f574a417767189fc Mon Sep 17 00:00:00 2001 From: Chris Penner Date: Wed, 31 Jul 2024 16:54:10 -0700 Subject: [PATCH 37/50] Fix transcripts --- unison-src/transcripts/find-command.md | 7 +--- unison-src/transcripts/find-command.output.md | 18 ++++----- unison-src/transcripts/names.md | 13 +++---- unison-src/transcripts/names.output.md | 37 +++++++++++++++---- 4 files changed, 44 insertions(+), 31 deletions(-) diff --git a/unison-src/transcripts/find-command.md b/unison-src/transcripts/find-command.md index 019903556a..56958476a5 100644 --- a/unison-src/transcripts/find-command.md +++ b/unison-src/transcripts/find-command.md @@ -34,15 +34,10 @@ Finding within a namespace ```ucm scratch/main> find bar --- Shows UUIDs --- scratch/main> find.global bar +scratch/other> debug.find.global bar scratch/main> find-in somewhere bar ``` ```ucm:error scratch/main> find baz ``` - -```ucm:error -scratch/main> find.global notHere -``` diff --git a/unison-src/transcripts/find-command.output.md b/unison-src/transcripts/find-command.output.md index fde54abfd1..4d3af86ad6 100644 --- a/unison-src/transcripts/find-command.output.md +++ b/unison-src/transcripts/find-command.output.md @@ -65,8 +65,15 @@ scratch/main> find bar 1. somewhere.bar : Nat --- Shows UUIDs --- scratch/main> find.global bar +scratch/other> debug.find.global bar + + Found results in scratch/main + + 1. .cat.lib.bar : Nat + 2. .lib.bar : Nat + 3. .somewhere.bar : Nat + + scratch/main> find-in somewhere bar 1. bar : Nat @@ -90,10 +97,3 @@ scratch/main> find baz namespace. ``` -``` ucm -scratch/main> find.global notHere - -⚠️ -I don't know how to find.global. Type `help` or `?` to get help. - -``` diff --git a/unison-src/transcripts/names.md b/unison-src/transcripts/names.md index 7780292f42..486ff35ec1 100644 --- a/unison-src/transcripts/names.md +++ b/unison-src/transcripts/names.md @@ -32,16 +32,13 @@ scratch/main> names #gjmq673r1v scratch/main> names .some.place.x ``` -`names.global` searches from the root, and absolutely qualifies results +`debug.names.global` searches from the root, and absolutely qualifies results - -TODO: swap this back to a 'ucm' block when names.global is re-implemented - -``` +```ucm -- We can search from a different branch and find all names in the codebase named 'x', and each of their aliases respectively. -scratch/other> names.global x +scratch/other> debug.names.global x -- We can search by hash, and see all aliases of that hash in the codebase -scratch/other> names.global #gjmq673r1v +scratch/other> debug.names.global #gjmq673r1v -- We can search using an absolute name -scratch/other> names.global .some.place.x +scratch/other> debug.names.global .some.place.x ``` diff --git a/unison-src/transcripts/names.output.md b/unison-src/transcripts/names.output.md index 78d1f5c9f1..06db804432 100644 --- a/unison-src/transcripts/names.output.md +++ b/unison-src/transcripts/names.output.md @@ -75,16 +75,37 @@ scratch/main> names .some.place.x Names: some.otherplace.y some.place.x somewhere.z ``` -`names.global` searches from the root, and absolutely qualifies results +`debug.names.global` searches from the root, and absolutely qualifies results -TODO: swap this back to a 'ucm' block when names.global is re-implemented - -``` +``` ucm -- We can search from a different branch and find all names in the codebase named 'x', and each of their aliases respectively. -scratch/other> names.global x +scratch/other> debug.names.global x + + Found results in scratch/main + + Terms + Hash: #gjmq673r1v + Names: some.otherplace.y some.place.x somewhere.z + + Hash: #pi25gcdv0o + Names: some.otherplace.x + -- We can search by hash, and see all aliases of that hash in the codebase -scratch/other> names.global #gjmq673r1v +scratch/other> debug.names.global #gjmq673r1v + + Found results in scratch/main + + Term + Hash: #gjmq673r1v + Names: some.otherplace.y some.place.x somewhere.z + -- We can search using an absolute name -scratch/other> names.global .some.place.x -``` +scratch/other> debug.names.global .some.place.x + Found results in scratch/main + + Term + Hash: #gjmq673r1v + Names: some.otherplace.y some.place.x somewhere.z + +``` From 8c9c3baad81d75ae68eecc4bde32de0309ac3a6c Mon Sep 17 00:00:00 2001 From: ChrisPenner Date: Thu, 1 Aug 2024 00:17:33 +0000 Subject: [PATCH 38/50] automatically run ormolu --- unison-cli/src/Unison/CommandLine/OutputMessages.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/unison-cli/src/Unison/CommandLine/OutputMessages.hs b/unison-cli/src/Unison/CommandLine/OutputMessages.hs index 62ed7ea70e..0bd733b88a 100644 --- a/unison-cli/src/Unison/CommandLine/OutputMessages.hs +++ b/unison-cli/src/Unison/CommandLine/OutputMessages.hs @@ -972,6 +972,7 @@ notifyUser dir = \case -- defs in the codebase. In some cases it's fine for bindings to -- shadow codebase names, but you don't want it to capture them in -- the decompiled output. + let prettyBindings = P.bracket . P.lines $ P.wrap "The watch expression(s) reference these definitions:" From 4803d446f1e361cf33c85daf698c6d58e31c796d Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 1 Aug 2024 10:51:32 -0600 Subject: [PATCH 39/50] Add some description to the new transcripts --- unison-src/transcripts/fix1327.md | 4 ++++ unison-src/transcripts/fix1327.output.md | 4 ++++ unison-src/transcripts/fix3977.md | 2 ++ unison-src/transcripts/fix3977.output.md | 2 ++ 4 files changed, 12 insertions(+) diff --git a/unison-src/transcripts/fix1327.md b/unison-src/transcripts/fix1327.md index 764d0f3ac5..45c1e11e92 100644 --- a/unison-src/transcripts/fix1327.md +++ b/unison-src/transcripts/fix1327.md @@ -4,6 +4,10 @@ foo = 4 bar = 5 ``` +`alias.many` should be able to consume the numbered args produced by `ls`. Previously, `ls` would produce absolute paths, but `alias.many` required relative ones. + +Now `ls` returns a pair of the absolute search directory and the result relative to that search directory, so it can be used in both absolute and relative contexts. + ```ucm scratch/main> add scratch/main> ls diff --git a/unison-src/transcripts/fix1327.output.md b/unison-src/transcripts/fix1327.output.md index fa542e6ed2..9e0234725a 100644 --- a/unison-src/transcripts/fix1327.output.md +++ b/unison-src/transcripts/fix1327.output.md @@ -18,6 +18,10 @@ bar = 5 foo : ##Nat ``` +`alias.many` should be able to consume the numbered args produced by `ls`. Previously, `ls` would produce absolute paths, but `alias.many` required relative ones. + +Now `ls` returns a pair of the absolute search directory and the result relative to that search directory, so it can be used in both absolute and relative contexts. + ``` ucm scratch/main> add diff --git a/unison-src/transcripts/fix3977.md b/unison-src/transcripts/fix3977.md index 8ad82cbce9..fc1fc1c718 100644 --- a/unison-src/transcripts/fix3977.md +++ b/unison-src/transcripts/fix3977.md @@ -2,6 +2,8 @@ scratch/main> builtins.merge ``` +Pretty-printing previously didn’t compensate for extra characters on a line that was about to be wrapped, resulting in a line-break without sufficient indentation. Now pretty-printing indents based on the starting column of the wrapped expression, not simply “prevIndent + 2”. + ```unison:hide failure msg context = Failure (typeLink Unit) msg (Any context) diff --git a/unison-src/transcripts/fix3977.output.md b/unison-src/transcripts/fix3977.output.md index 79a68eedc4..d4451d8c94 100644 --- a/unison-src/transcripts/fix3977.output.md +++ b/unison-src/transcripts/fix3977.output.md @@ -1,3 +1,5 @@ +Pretty-printing previously didn’t compensate for extra characters on a line that was about to be wrapped, resulting in a line-break without sufficient indentation. Now pretty-printing indents based on the starting column of the wrapped expression, not simply “prevIndent + 2”. + ``` unison failure msg context = Failure (typeLink Unit) msg (Any context) From 9100b97e91a3dd8df44038162072082162ec4c9a Mon Sep 17 00:00:00 2001 From: Arya Irani Date: Thu, 1 Aug 2024 17:49:45 -0400 Subject: [PATCH 40/50] tweak output messages --- .../Unison/Codebase/Editor/HandleInput/Update2.hs | 4 ++-- unison-cli/src/Unison/CommandLine/OutputMessages.hs | 13 +++++++------ unison-src/transcripts/update-on-conflict.output.md | 5 +++-- .../update-suffixifies-properly.output.md | 4 ++-- ...-term-with-dependent-to-different-type.output.md | 4 ++-- .../update-test-watch-roundtrip.output.md | 4 ++-- ...type-delete-constructor-with-dependent.output.md | 4 ++-- .../update-type-delete-record-field.output.md | 4 ++-- .../update-type-with-dependent-term.output.md | 4 ++-- ...-with-dependent-type-to-different-kind.output.md | 4 ++-- 10 files changed, 26 insertions(+), 24 deletions(-) diff --git a/unison-cli/src/Unison/Codebase/Editor/HandleInput/Update2.hs b/unison-cli/src/Unison/Codebase/Editor/HandleInput/Update2.hs index 858003c431..f2650da4d3 100644 --- a/unison-cli/src/Unison/Codebase/Editor/HandleInput/Update2.hs +++ b/unison-cli/src/Unison/Codebase/Editor/HandleInput/Update2.hs @@ -156,9 +156,9 @@ makePrettyUnisonFile originalFile dependents = originalFile <> Pretty.newline <> Pretty.newline - <> "-- The definitions below are not compatible with the updated definitions above." + <> "-- The definitions below no longer typecheck with the changes above." <> Pretty.newline - <> "-- Please fix the errors and run `update` again." + <> "-- Please fix the errors and try `update` again." <> Pretty.newline <> Pretty.newline <> ( dependents diff --git a/unison-cli/src/Unison/CommandLine/OutputMessages.hs b/unison-cli/src/Unison/CommandLine/OutputMessages.hs index 0194da8ea1..f4aaac02e7 100644 --- a/unison-cli/src/Unison/CommandLine/OutputMessages.hs +++ b/unison-cli/src/Unison/CommandLine/OutputMessages.hs @@ -2095,11 +2095,12 @@ notifyUser dir = \case <> P.text filename ConflictedDefn operation defn -> pure . P.wrap $ - ( case defn of - TermDefn (Conflicted name _refs) -> "The term name" <> prettyName name <> "is ambiguous." - TypeDefn (Conflicted name _refs) -> "The type name" <> prettyName name <> "is ambiguous." + ( "This branch has more than one" <> case defn of + TermDefn (Conflicted name _refs) -> "term with the name" <> P.group (P.backticked (prettyName name) <> ".") + TypeDefn (Conflicted name _refs) -> "type with the name" <> P.group (P.backticked (prettyName name) <> ".") ) - <> "Please resolve the ambiguity, then try to" + <> P.newline + <> "Please delete or rename all but one of them, then try the" <> P.text operation <> "again." IncoherentDeclDuringMerge aliceOrBob reason -> @@ -2614,7 +2615,7 @@ renderNameConflicts hashLen conflictedNames = do prettyConflictedTerms <- showConflictedNames "term" conflictedTermNames pure $ Monoid.unlessM (null allConflictedNames) $ - P.callout "❓" . P.sep "\n\n" . P.nonEmpty $ + P.callout "❓" . P.linesSpaced . P.nonEmpty $ [ prettyConflictedTypes, prettyConflictedTerms, tip $ @@ -2635,7 +2636,7 @@ renderNameConflicts hashLen conflictedNames = do where showConflictedNames :: Pretty -> Map Name [HQ.HashQualified Name] -> Numbered Pretty showConflictedNames thingKind conflictedNames = - P.lines <$> do + P.linesSpaced <$> do for (Map.toList conflictedNames) \(name, hashes) -> do prettyConflicts <- for hashes \hash -> do n <- addNumberedArg $ SA.HashQualified hash diff --git a/unison-src/transcripts/update-on-conflict.output.md b/unison-src/transcripts/update-on-conflict.output.md index d2a5f2de22..9beda9810c 100644 --- a/unison-src/transcripts/update-on-conflict.output.md +++ b/unison-src/transcripts/update-on-conflict.output.md @@ -59,7 +59,8 @@ x = 3 ``` ucm scratch/main> update - The term name x is ambiguous. Please resolve the ambiguity, - then try to update again. + This branch has more than one term with the name `x`. Please + delete or rename all but one of them, then try the update + again. ``` diff --git a/unison-src/transcripts/update-suffixifies-properly.output.md b/unison-src/transcripts/update-suffixifies-properly.output.md index 8e71e3a904..e8a30e7f38 100644 --- a/unison-src/transcripts/update-suffixifies-properly.output.md +++ b/unison-src/transcripts/update-suffixifies-properly.output.md @@ -72,8 +72,8 @@ myproject/main> update ``` unison:added-by-ucm scratch.u foo = +30 --- The definitions below are not compatible with the updated definitions above. --- Please fix the errors and run `update` again. +-- The definitions below no longer typecheck with the changes above. +-- Please fix the errors and try `update` again. bar : Nat bar = diff --git a/unison-src/transcripts/update-term-with-dependent-to-different-type.output.md b/unison-src/transcripts/update-term-with-dependent-to-different-type.output.md index 646d559fbd..c1737627d4 100644 --- a/unison-src/transcripts/update-term-with-dependent-to-different-type.output.md +++ b/unison-src/transcripts/update-term-with-dependent-to-different-type.output.md @@ -71,8 +71,8 @@ scratch/main> update foo : Int foo = +5 --- The definitions below are not compatible with the updated definitions above. --- Please fix the errors and run `update` again. +-- The definitions below no longer typecheck with the changes above. +-- Please fix the errors and try `update` again. bar : Nat bar = diff --git a/unison-src/transcripts/update-test-watch-roundtrip.output.md b/unison-src/transcripts/update-test-watch-roundtrip.output.md index 9caf54c2f3..45ddaaa3f8 100644 --- a/unison-src/transcripts/update-test-watch-roundtrip.output.md +++ b/unison-src/transcripts/update-test-watch-roundtrip.output.md @@ -53,8 +53,8 @@ scratch/main> update ``` unison:added-by-ucm scratch.u foo n = "hello, world!" --- The definitions below are not compatible with the updated definitions above. --- Please fix the errors and run `update` again. +-- The definitions below no longer typecheck with the changes above. +-- Please fix the errors and try `update` again. test> mynamespace.foo.test = n = 2 diff --git a/unison-src/transcripts/update-type-delete-constructor-with-dependent.output.md b/unison-src/transcripts/update-type-delete-constructor-with-dependent.output.md index eee51c7060..085d0826a7 100644 --- a/unison-src/transcripts/update-type-delete-constructor-with-dependent.output.md +++ b/unison-src/transcripts/update-type-delete-constructor-with-dependent.output.md @@ -67,8 +67,8 @@ scratch/main> update ``` unison:added-by-ucm scratch.u type Foo = Bar Nat --- The definitions below are not compatible with the updated definitions above. --- Please fix the errors and run `update` again. +-- The definitions below no longer typecheck with the changes above. +-- Please fix the errors and try `update` again. foo : Foo -> Nat foo = cases diff --git a/unison-src/transcripts/update-type-delete-record-field.output.md b/unison-src/transcripts/update-type-delete-record-field.output.md index fe6038c1c3..fb3f7a3c99 100644 --- a/unison-src/transcripts/update-type-delete-record-field.output.md +++ b/unison-src/transcripts/update-type-delete-record-field.output.md @@ -106,8 +106,8 @@ scratch/main> find.verbose ``` unison:added-by-ucm scratch.u type Foo = { bar : Nat } --- The definitions below are not compatible with the updated definitions above. --- Please fix the errors and run `update` again. +-- The definitions below no longer typecheck with the changes above. +-- Please fix the errors and try `update` again. Foo.baz : Foo -> Int Foo.baz = cases Foo _ baz -> baz diff --git a/unison-src/transcripts/update-type-with-dependent-term.output.md b/unison-src/transcripts/update-type-with-dependent-term.output.md index e200341bb1..c334a5e853 100644 --- a/unison-src/transcripts/update-type-with-dependent-term.output.md +++ b/unison-src/transcripts/update-type-with-dependent-term.output.md @@ -62,8 +62,8 @@ scratch/main> update ``` unison:added-by-ucm scratch.u type Foo = Bar Nat Nat --- The definitions below are not compatible with the updated definitions above. --- Please fix the errors and run `update` again. +-- The definitions below no longer typecheck with the changes above. +-- Please fix the errors and try `update` again. incrFoo : Foo -> Foo incrFoo = cases Bar n -> Bar (n Nat.+ 1) diff --git a/unison-src/transcripts/update-type-with-dependent-type-to-different-kind.output.md b/unison-src/transcripts/update-type-with-dependent-type-to-different-kind.output.md index dcb3c96d24..bff59176e3 100644 --- a/unison-src/transcripts/update-type-with-dependent-type-to-different-kind.output.md +++ b/unison-src/transcripts/update-type-with-dependent-type-to-different-kind.output.md @@ -60,8 +60,8 @@ scratch/main> update ``` unison:added-by-ucm scratch.u type Foo a = Bar Nat a --- The definitions below are not compatible with the updated definitions above. --- Please fix the errors and run `update` again. +-- The definitions below no longer typecheck with the changes above. +-- Please fix the errors and try `update` again. type Baz = Qux Foo From c5a66d5608849dbe2f4d4b9164cae34ade4c6ca1 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 29 Jul 2024 16:49:21 -0600 Subject: [PATCH 41/50] Simplify Doc parser from `State` to `Reader` --- .../src/Unison/Syntax/Lexer/Unison.hs | 2 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 44 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index 8a6c20d1a8..c641786505 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -355,7 +355,7 @@ doc2 = do (docTok, closeTok) <- local (\env -> env {inLayout = False}) do - body <- Doc.doc typeOrTerm lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.doc typeOrTerm lexemes' . P.lookAhead $ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index 8ba6840dd2..1a03665493 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -5,12 +5,12 @@ -- -- - an identifer parser -- - a code parser (that accepts a termination parser) --- - a termination parser (only used for lookahead), for this parser to know when to give up +-- - a termination parser, for this parser to know when to give up -- -- Each of those parsers is expected to satisfy @(`Ord` e, `P.MonadParsec` e `String` m)@. module Unison.Syntax.Parser.Doc ( Tree, - initialState, + initialEnv, doc, untitledSection, sectionElem, @@ -52,7 +52,7 @@ module Unison.Syntax.Parser.Doc where import Control.Comonad.Cofree (Cofree ((:<))) -import Control.Monad.State qualified as S +import Control.Monad.Reader qualified as R import Data.Char (isControl, isSpace) import Data.List qualified as List import Data.List.Extra qualified as List @@ -64,7 +64,7 @@ import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude hiding (join) -import Unison.Syntax.Lexer (column, line, lit, local, sepBy1', some', someTill', (<+>)) +import Unison.Syntax.Lexer (column, line, lit, sepBy1', some', someTill', (<+>)) import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) import Unison.Syntax.Parser.Doc.Data @@ -79,16 +79,16 @@ data ParsingEnv = ParsingEnv } deriving (Show) -initialState :: ParsingEnv -initialState = ParsingEnv [0] 0 +initialEnv :: ParsingEnv +initialEnv = ParsingEnv [0] 0 doc :: (Ord e, P.MonadParsec e String m, Annotated code) => m ident -> (m () -> m code) -> - m () -> + m end -> m (UntitledSection (Tree ident code)) -doc ident code = flip S.evalStateT initialState . untitledSection . sectionElem ident code +doc ident code = flip R.runReaderT initialEnv . untitledSection . sectionElem ident code . void -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). @@ -100,7 +100,7 @@ sectionElem :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Tree ident code) + R.ReaderT ParsingEnv m (Tree ident code) sectionElem ident code docClose = fmap wrap' $ section ident code docClose @@ -390,13 +390,13 @@ list :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) list ident code docClose = bulletedList ident code docClose <|> numberedList ident code docClose -listSep :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m () +listSep :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m () listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) -bulletedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, [a]) +bulletedStart :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m (Int, [a]) bulletedStart = P.try $ do r <- listItemStart $ [] <$ P.satisfy bulletChar P.lookAhead (P.satisfy isSpace) @@ -404,15 +404,15 @@ bulletedStart = P.try $ do where bulletChar ch = ch == '*' || ch == '-' || ch == '+' -listItemStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m a -> m (Int, a) +listItemStart :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m a -> m (Int, a) listItemStart gutter = P.try do nonNewlineSpaces col <- column <$> posP - parentCol <- S.gets parentListColumn + parentCol <- R.asks parentListColumn guard (col > parentCol) (col,) <$> gutter -numberedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, Token Word64) +numberedStart :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m (Int, Token Word64) numberedStart = listItemStart $ P.try (tokenP $ LP.decimal <* lit ".") -- | FIXME: This should take a @`P` a@ @@ -421,7 +421,7 @@ numberedList :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) numberedList ident code docClose = NumberedList <$> sepBy1' numberedItem listSep where numberedItem = P.label "numbered list (examples: 1. item1, 8. start numbering at '8')" do @@ -434,7 +434,7 @@ bulletedList :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) bulletedList ident code docClose = BulletedList <$> sepBy1' bullet listSep where bullet = P.label "bullet (examples: * item1, - item2)" do @@ -447,11 +447,11 @@ column' :: (m () -> m code) -> m () -> Int -> - S.StateT ParsingEnv m (Column (Tree ident code)) + R.ReaderT ParsingEnv m (Column (Tree ident code)) column' ident code docClose col = Column . wrap' <$> (nonNewlineSpaces *> listItemParagraph) - <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list ident code docClose)) + <*> R.local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list ident code docClose)) where listItemParagraph = Paragraph <$> do @@ -497,14 +497,14 @@ section :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) section ident code docClose = do - ns <- S.gets parentSections + ns <- R.asks parentSections hashes <- lift $ P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp docClose title <- lift $ paragraph ident code docClose <* CP.space let m = length hashes + head ns body <- - local (\env -> env {parentSections = (m : (tail ns))}) $ + R.local (\env -> env {parentSections = m : tail ns}) $ P.many (sectionElem ident code docClose <* CP.space) pure $ Section (wrap' title) body From 96f865b37c4fcb31928d976ece21dbf3231d8e87 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 1 Aug 2024 22:53:33 -0600 Subject: [PATCH 42/50] Add a transcript showing that #5076 was fixed Some handling of blocks without final newlines was improved in the course of this PR. Fixes #5076. --- unison-src/transcripts/fix5076.md | 12 ++++++++++++ unison-src/transcripts/fix5076.output.md | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 unison-src/transcripts/fix5076.md create mode 100644 unison-src/transcripts/fix5076.output.md diff --git a/unison-src/transcripts/fix5076.md b/unison-src/transcripts/fix5076.md new file mode 100644 index 0000000000..d2c4b5a7b2 --- /dev/null +++ b/unison-src/transcripts/fix5076.md @@ -0,0 +1,12 @@ +```ucm:hide +scratch/main> builtins.mergeio +``` + +Nested call to code lexer wasn’t terminating inline examples containing blocks properly. + +```unison +x = {{ + ``let "me"`` live + ``do "me"`` in + }} +``` diff --git a/unison-src/transcripts/fix5076.output.md b/unison-src/transcripts/fix5076.output.md new file mode 100644 index 0000000000..f92954cd23 --- /dev/null +++ b/unison-src/transcripts/fix5076.output.md @@ -0,0 +1,22 @@ +Nested call to code lexer wasn’t terminating inline examples containing blocks properly. + +``` unison +x = {{ + ``let "me"`` live + ``do "me"`` in + }} +``` + +``` ucm + + Loading changes detected in scratch.u. + + I found and typechecked these definitions in scratch.u. If you + do an `add` or `update`, here's how your codebase would + change: + + ⍟ These new definitions are ok to `add`: + + x : Doc2 + +``` From ac75905f8b44bf58a2d00fd30a675d8fe0f7d4fc Mon Sep 17 00:00:00 2001 From: Arya Irani <538571+aryairani@users.noreply.github.com> Date: Fri, 2 Aug 2024 13:44:31 -0400 Subject: [PATCH 43/50] debug nix-dev-cache --- .github/workflows/nix-dev-cache.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/nix-dev-cache.yaml b/.github/workflows/nix-dev-cache.yaml index 9ee02af326..0a48c953f3 100644 --- a/.github/workflows/nix-dev-cache.yaml +++ b/.github/workflows/nix-dev-cache.yaml @@ -36,5 +36,7 @@ jobs: with: name: unison authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: build all packages and development shells run: nix -L build --accept-flake-config --no-link --keep-going '.#all' From 3e87dc3854e4cb3a16d75f59bb4983cb71cba1d2 Mon Sep 17 00:00:00 2001 From: Chris Penner Date: Fri, 2 Aug 2024 10:34:24 -0700 Subject: [PATCH 44/50] Fix unused-binding-detection in case patterns --- .../src/Unison/Syntax/TermParser.hs | 3 +- .../Unison/LSP/FileAnalysis/UnusedBindings.hs | 67 ++++++++++++++++--- unison-cli/tests/Unison/Test/LSP.hs | 28 ++++++-- 3 files changed, 82 insertions(+), 16 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 635a974d89..78048c404e 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -294,12 +294,13 @@ parsePattern = label "pattern" root do _ <- anyToken; pure (Set.findMin s <$ tok) where isLower = Text.all Char.isLower . Text.take 1 . Name.toText + isIgnored n = Text.take 1 (Name.toText n) == "_" die hq s = case L.payload hq of -- if token not hash qualified or uppercase, -- fail w/out consuming it to allow backtracking HQ.NameOnly n | Set.null s - && isLower n -> + && (isLower n || isIgnored n) -> fail $ "not a constructor name: " <> show n -- it was hash qualified, and wasn't found in the env, that's a failure! _ -> failCommitted $ err hq s diff --git a/unison-cli/src/Unison/LSP/FileAnalysis/UnusedBindings.hs b/unison-cli/src/Unison/LSP/FileAnalysis/UnusedBindings.hs index 46d87c6ec1..85a3511cfd 100644 --- a/unison-cli/src/Unison/LSP/FileAnalysis/UnusedBindings.hs +++ b/unison-cli/src/Unison/LSP/FileAnalysis/UnusedBindings.hs @@ -14,20 +14,42 @@ import Unison.Parser.Ann (Ann) import Unison.Prelude import Unison.Symbol (Symbol (..)) import Unison.Term (Term) +import Unison.Term qualified as Term +import Unison.Util.List qualified as ListUtils import Unison.Util.Range qualified as Range import Unison.Var qualified as Var +data VarUsages + = VarUsages + { unusedVars :: Map Symbol (Set Ann), + usedVars :: Set Symbol, + -- This is generally a copy of usedVars, except that we _don't_ remove variables when they go out of scope. + -- This is solely so we have the information to handle an edge case in pattern guards where vars are independently + -- brought into scope in BOTH the guards and the body of a match case, and we want to count a var as used if it + -- appears in _either_. + allUsedVars :: Set Symbol + } + +instance Semigroup VarUsages where + VarUsages a b c <> VarUsages a' b' c' = + VarUsages (Map.unionWith (<>) a a') (b <> b') (c <> c') + +instance Monoid VarUsages where + mempty = VarUsages mempty mempty mempty + analyseTerm :: Lsp.Uri -> Term Symbol Ann -> [Diagnostic] analyseTerm fileUri tm = - let (unusedVars, _) = ABT.cata alg tm + let (VarUsages {unusedVars}) = ABT.cata alg tm vars = Map.toList unusedVars & mapMaybe \(v, ann) -> do (,ann) <$> getRelevantVarName v diagnostics = - vars & mapMaybe \(varName, ann) -> do + vars & foldMap \(varName, anns) -> do + ann <- Set.toList anns + range <- maybeToList $ Cv.annToURange ann -- Limit the range to the first line of the binding to not be too annoying. -- Maybe in the future we can get the actual annotation of the variable name. - lspRange <- Cv.uToLspRange . Range.startingLine <$> Cv.annToURange ann + let lspRange = Cv.uToLspRange . Range.startingLine $ range pure $ Diagnostic.mkDiagnostic fileUri lspRange Diagnostic.DiagnosticSeverity_Warning [Lsp.DiagnosticTag_Unnecessary] ("Unused binding " <> tShow varName <> ". Use the binding, or prefix it with an _ to dismiss this warning.") [] in diagnostics where @@ -41,12 +63,39 @@ analyseTerm fileUri tm = guard (not (Text.isPrefixOf "_" n)) Just n _ -> Nothing - alg :: (Foldable f, Ord v) => Ann -> ABT f v (Map v Ann, Set v) -> (Map v Ann, Set v) + alg :: + Ann -> + (ABT (Term.F Symbol Ann Ann) Symbol VarUsages -> VarUsages) alg ann abt = case abt of - Var v -> (mempty, Set.singleton v) + Var v -> VarUsages {unusedVars = mempty, usedVars = Set.singleton v, allUsedVars = Set.singleton v} Cycle x -> x - Abs v (unusedBindings, usedVars) -> + Abs v (VarUsages {unusedVars, usedVars, allUsedVars}) -> if v `Set.member` usedVars - then (unusedBindings, Set.delete v usedVars) - else (Map.insert v ann unusedBindings, usedVars) - Tm fx -> Foldable.fold fx + then VarUsages {unusedVars, usedVars = Set.delete v usedVars, allUsedVars} + else VarUsages {unusedVars = Map.insert v (Set.singleton ann) unusedVars, usedVars, allUsedVars} + Tm fx -> + case fx of + -- We need to special-case pattern guards because the pattern, guard, and body treat each of their vars in + -- their own independent scopes, even though the vars created in the pattern are the same ones used in the + -- guards and bindings :shrug: + Term.Match scrutinee cases -> + let -- There's a separate case for every guard on a single pattern, so we first do our best to group up cases with the same pattern. + -- Otherwise, a var may be reported unused in one branch of a guard even though it's used in another branch. + groupedCases = ListUtils.groupBy (\(Term.MatchCase pat _ _) -> pat) cases + caseVars = + groupedCases & foldMap \singlePatCases -> + let (VarUsages {unusedVars = unused, usedVars = used, allUsedVars = allUsed}) = + singlePatCases + & foldMap + ( \(Term.MatchCase pat guard body) -> + -- This is imprecise, but it's quite annoying to get the actual ann of the unused bindings, so + -- we just use the FULL span of the pattern for now. We could fix this with a bit + -- of elbow grease. + let patSpanAnn = fold pat + combindedVarUsages = fold guard <> body + in combindedVarUsages {unusedVars = (unusedVars combindedVarUsages) $> (Set.singleton patSpanAnn)} + ) + actuallyUnusedVars = unused & Map.filterWithKey \k _ -> k `Set.notMember` allUsed + in VarUsages {unusedVars = actuallyUnusedVars, usedVars = used, allUsedVars = allUsed} + in scrutinee <> caseVars + _ -> Foldable.fold fx diff --git a/unison-cli/tests/Unison/Test/LSP.hs b/unison-cli/tests/Unison/Test/LSP.hs index 880fd6214b..2ab406da56 100644 --- a/unison-cli/tests/Unison/Test/LSP.hs +++ b/unison-cli/tests/Unison/Test/LSP.hs @@ -416,21 +416,24 @@ withTestCodebase action = do makeDiagnosticRangeTest :: (String, Text) -> Test () makeDiagnosticRangeTest (testName, testSrc) = scope testName $ do - (ann, _block, cleanSrc) <- case extractDelimitedBlock ('«', '»') testSrc of - Nothing -> crash "expected exactly one delimited block" - Just r -> pure r + let (cleanSrc, mayExpectedDiagnostic) = case extractDelimitedBlock ('«', '»') testSrc of + Nothing -> (testSrc, Nothing) + Just (ann, block, clean) -> (clean, Just (ann, block)) (pf, _mayTypecheckedFile) <- typecheckSrc testName cleanSrc UF.terms pf & Map.elems & \case [(_a, trm)] -> do - case UnusedBindings.analyseTerm (LSP.Uri "test") trm of - [diag] -> do + case (mayExpectedDiagnostic, UnusedBindings.analyseTerm (LSP.Uri "test") trm) of + (Just (ann, _block), [diag]) -> do let expectedRange = Cv.annToRange ann let actualRange = Just (diag ^. LSP.range) when (expectedRange /= actualRange) do crash $ "Expected diagnostic at range: " <> show expectedRange <> ", got: " <> show actualRange - _ -> crash "Expected exactly one diagnostic" + (Nothing, []) -> pure () + (expected, actual) -> case expected of + Nothing -> crash $ "Expected no diagnostics, got: " <> show actual + Just _ -> crash $ "Expected exactly one diagnostic, but got " <> show actual _ -> crash "Expected exactly one term" unusedBindingLocations :: Test () @@ -446,5 +449,18 @@ unusedBindingLocations = ), ( "Unused argument", [here|term «unused» = 1|] + ), + ( "Unused binding in cases block", + [here|term = cases + -- Note: the diagnostic _should_ only wrap the unused bindings, but right now it just wraps the whole pattern. + («unused, used») + | used > 0 -> true + | otherwise -> false + |] + ), + ( "Ignored unused binding in cases block shouldn't error", + [here|term = cases + (used, _ignored) -> used + |] ) ] From a26a31a7cd0b7a1676782fa539e0601f6f896541 Mon Sep 17 00:00:00 2001 From: Arya Irani <538571+aryairani@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:13:47 -0400 Subject: [PATCH 45/50] Update nix-dev-cache.yaml --- .github/workflows/nix-dev-cache.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/nix-dev-cache.yaml b/.github/workflows/nix-dev-cache.yaml index 0a48c953f3..dc17e49263 100644 --- a/.github/workflows/nix-dev-cache.yaml +++ b/.github/workflows/nix-dev-cache.yaml @@ -24,6 +24,11 @@ jobs: - macOS-14 steps: - uses: actions/checkout@v4 + - name: set up mount points on Linux + if: runner.os == 'Linux' + run: | + mkdir /nix /mnt/nix + mount -B /mnt/nix /nix - uses: cachix/install-nix-action@v27 if: runner.os == 'Linux' with: From d12176f2bbecf01803b9d89e32c3b16d69eca4a2 Mon Sep 17 00:00:00 2001 From: Arya Irani <538571+aryairani@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:14:48 -0400 Subject: [PATCH 46/50] Update nix-dev-cache.yaml --- .github/workflows/nix-dev-cache.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nix-dev-cache.yaml b/.github/workflows/nix-dev-cache.yaml index dc17e49263..a4a51b0cce 100644 --- a/.github/workflows/nix-dev-cache.yaml +++ b/.github/workflows/nix-dev-cache.yaml @@ -27,8 +27,8 @@ jobs: - name: set up mount points on Linux if: runner.os == 'Linux' run: | - mkdir /nix /mnt/nix - mount -B /mnt/nix /nix + sudo mkdir /nix /mnt/nix + sudo mount -B /mnt/nix /nix - uses: cachix/install-nix-action@v27 if: runner.os == 'Linux' with: From d903fd240191fe9ba7f01e5db739ce48141f7d8c Mon Sep 17 00:00:00 2001 From: Arya Irani <538571+aryairani@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:32:57 -0400 Subject: [PATCH 47/50] Update nix-dev-cache.yaml --- .github/workflows/nix-dev-cache.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nix-dev-cache.yaml b/.github/workflows/nix-dev-cache.yaml index a4a51b0cce..ee13b8a64c 100644 --- a/.github/workflows/nix-dev-cache.yaml +++ b/.github/workflows/nix-dev-cache.yaml @@ -24,7 +24,7 @@ jobs: - macOS-14 steps: - uses: actions/checkout@v4 - - name: set up mount points on Linux + - name: set up nix mount points on Linux if: runner.os == 'Linux' run: | sudo mkdir /nix /mnt/nix @@ -41,7 +41,7 @@ jobs: with: name: unison authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: build all packages and development shells run: nix -L build --accept-flake-config --no-link --keep-going '.#all' + - name: print disk free status + run: df -h From 02f9eb7eeddf318b1ee449cb3fa8bae64b52ac40 Mon Sep 17 00:00:00 2001 From: Arya Irani <538571+aryairani@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:43:05 -0400 Subject: [PATCH 48/50] Update .github/workflows/nix-dev-cache.yaml Co-authored-by: Greg Pfeil --- .github/workflows/nix-dev-cache.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nix-dev-cache.yaml b/.github/workflows/nix-dev-cache.yaml index ee13b8a64c..df2291c8e4 100644 --- a/.github/workflows/nix-dev-cache.yaml +++ b/.github/workflows/nix-dev-cache.yaml @@ -24,7 +24,7 @@ jobs: - macOS-14 steps: - uses: actions/checkout@v4 - - name: set up nix mount points on Linux + - name: mount Nix store on larger partition if: runner.os == 'Linux' run: | sudo mkdir /nix /mnt/nix From d7ac7c60311e94f688c8cb4535cc1c0419ae6244 Mon Sep 17 00:00:00 2001 From: Arya Irani <538571+aryairani@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:48:33 -0400 Subject: [PATCH 49/50] Update .github/workflows/nix-dev-cache.yaml --- .github/workflows/nix-dev-cache.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nix-dev-cache.yaml b/.github/workflows/nix-dev-cache.yaml index df2291c8e4..4300e1bc81 100644 --- a/.github/workflows/nix-dev-cache.yaml +++ b/.github/workflows/nix-dev-cache.yaml @@ -25,6 +25,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: mount Nix store on larger partition + # on the Linux runner `/` doesn't have enough space, but there's a `/mnt` which does. if: runner.os == 'Linux' run: | sudo mkdir /nix /mnt/nix From 6cb39c8838449a69c7b4af920eb207f25eedbade Mon Sep 17 00:00:00 2001 From: Arya Irani <538571+aryairani@users.noreply.github.com> Date: Fri, 2 Aug 2024 16:00:20 -0400 Subject: [PATCH 50/50] Update .github/workflows/nix-dev-cache.yaml Co-authored-by: Greg Pfeil --- .github/workflows/nix-dev-cache.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nix-dev-cache.yaml b/.github/workflows/nix-dev-cache.yaml index 4300e1bc81..2ac6857c37 100644 --- a/.github/workflows/nix-dev-cache.yaml +++ b/.github/workflows/nix-dev-cache.yaml @@ -29,7 +29,7 @@ jobs: if: runner.os == 'Linux' run: | sudo mkdir /nix /mnt/nix - sudo mount -B /mnt/nix /nix + sudo mount --bind /mnt/nix /nix - uses: cachix/install-nix-action@v27 if: runner.os == 'Linux' with: