From 21209e2bdd48a16cca4d74d000d2f0494c78d3f5 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 1 Jul 2024 13:17:38 -0600 Subject: [PATCH 01/22] Extract the `Doc` lexer into a top-level function --- unison-syntax/src/Unison/Syntax/Lexer.hs | 857 ++++++++++++----------- 1 file changed, 429 insertions(+), 428 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 14fe31f9a7..144ccd95c3 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -400,6 +400,435 @@ restoreStack lbl p = do S.put (s2 {layout = layout1}) pure $ p <> closes +-- | The `Doc` lexer as documented on unison-lang.org +doc2 :: P [Token Lexeme] +doc2 = do + -- Ensure we're at a doc before we start consuming tokens + P.lookAhead (lit "{{") + openStart <- posP + -- Produce any layout tokens, such as closing the last open block or virtual semicolons + -- We don't use 'token' on "{{" directly because we don't want to duplicate layout + -- tokens if we do the rewrite hack for type-docs below. + beforeStartToks <- token' ignore (pure ()) + void $ lit "{{" + openEnd <- posP + CP.space + -- Construct the token for opening the doc block. + let openTok = Token (Open "syntax.docUntitledSection") openStart openEnd + env0 <- S.get + -- Disable layout while parsing the doc block and reset the section number + (bodyToks0, closeTok) <- local + ( \env -> + env + { inLayout = False, + parentSections = 0 : (parentSections env0) + } + ) + do + bodyToks <- body + closeStart <- posP + lit "}}" + closeEnd <- posP + pure (bodyToks, Token Close closeStart closeEnd) + let docToks = beforeStartToks <> [openTok] <> bodyToks0 <> [closeTok] + -- Parse any layout tokens after the doc block, e.g. virtual semicolon + endToks <- token' ignore (pure ()) + -- Hack to allow anonymous doc blocks before type decls + -- {{ Some docs }} Foo.doc = {{ Some docs }} + -- ability Foo where => ability Foo where + tn <- subsequentTypeName + pure $ case (tn) of + -- If we're followed by a type, we rewrite the doc block to be a named doc block. + (Just (WordyId tname)) + | isTopLevel -> + beforeStartToks + <> [WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment)) <$ openTok, Open "=" <$ openTok] + <> [openTok] + <> bodyToks0 + <> [closeTok] + -- We need an extra 'Close' here because we added an extra Open above. + <> [closeTok] + <> endToks + where + isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 + _ -> docToks <> endToks + where + wordyKw kw = separated wordySep (lit kw) + subsequentTypeName = P.lookAhead . P.optional $ do + let lit' s = lit s <* sp + let modifier = typeModifiersAlt (lit' . Text.unpack) + let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) + _ <- optional modifier *> typeOrAbility' *> sp + Token name start stop <- tokenP identifierP + if Name.isSymboly (HQ'.toName name) + then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) + else pure (WordyId name) + ignore _ _ _ = [] + body = join <$> P.many (sectionElem <* CP.space) + sectionElem = section <|> fencedBlock <|> list <|> paragraph + paragraph = wrap "syntax.docParagraph" $ join <$> spaced leaf + reserved word = List.isPrefixOf "}}" word || all (== '#') word + + wordy closing = wrap "syntax.docWord" . tok . fmap Textual . 
P.try $ do + let end = + P.lookAhead $ + void docClose + <|> void docOpen + <|> void (P.satisfy isSpace) + <|> void closing + word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end + guard (not $ reserved word || null word) + pure word + + leafy closing = groupy closing gs + where + gs = + link + <|> externalLink + <|> exampleInline + <|> expr + <|> boldOrItalicOrStrikethrough closing + <|> verbatim + <|> atDoc + <|> wordy closing + + leaf = leafy mzero + + atDoc = src <|> evalInline <|> signature <|> signatureInline + where + comma = lit "," <* CP.space + src = + src' "syntax.docSource" "@source" + <|> src' "syntax.docFoldedSource" "@foldedSource" + srcElem = + wrap "syntax.docSourceElement" $ + (typeLink <|> termLink) + <+> ( fmap (fromMaybe []) . P.optional $ + (tok (Reserved <$> lit "@") <+> (CP.space *> annotations)) + ) + where + annotation = tok identifierLexemeP <|> expr <* CP.space + annotations = + join <$> P.some (wrap "syntax.docEmbedAnnotation" annotation) + src' name atName = wrap name $ do + _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space + s <- P.sepBy1 srcElem comma + _ <- lit "}" + pure (join s) + signature = wrap "syntax.docSignature" $ do + _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space + s <- join <$> P.sepBy1 signatureLink comma + _ <- lit "}" + pure s + signatureInline = wrap "syntax.docSignatureInline" $ do + _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space + s <- signatureLink + _ <- lit "}" + pure s + evalInline = wrap "syntax.docEvalInline" $ do + _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space + let inlineEvalClose = [] <$ lit "}" + s <- lexemes' inlineEvalClose + pure s + + typeLink = wrap "syntax.docEmbedTypeLink" do + _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space + tok identifierLexemeP <* CP.space + + termLink = + wrap "syntax.docEmbedTermLink" $ + tok identifierLexemeP <* CP.space + + signatureLink = + wrap "syntax.docEmbedSignatureLink" $ + tok identifierLexemeP <* CP.space + + groupy closing p = do + Token p start stop <- tokenP p + after <- P.optional . P.try $ leafy closing + pure $ case after of + Nothing -> p + Just after -> + [ Token (Open "syntax.docGroup") start stop', + Token (Open "syntax.docJoin") start stop' + ] + <> p + <> after + <> (take 2 $ repeat (Token Close stop' stop')) + where + stop' = maybe stop end (lastMay after) + + verbatim = + P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do + Token originalText start stop <- tokenP do + -- a single backtick followed by a non-backtick is treated as monospaced + let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) + -- also two or more ' followed by that number of closing ' + quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) + P.someTill P.anySingle (lit quotes) + let isMultiLine = line start /= line stop + if isMultiLine + then do + let trimmed = (trimAroundDelimiters originalText) + let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. 
See 'trimIndentFromVerbatimBlock' + wrap "syntax.docVerbatim" $ + wrap "syntax.docWord" $ + pure [Token (Textual txt) start stop] + else + wrap "syntax.docCode" $ + wrap "syntax.docWord" $ + pure [Token (Textual originalText) start stop] + + exampleInline = + P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ + wrap "syntax.docExample" $ do + n <- P.try $ do + _ <- lit "`" + length <$> P.takeWhile1P (Just "backticks") (== '`') + let end :: P [Token Lexeme] = [] <$ lit (replicate (n + 1) '`') + ex <- CP.space *> lexemes' end + pure ex + + docClose = [] <$ lit "}}" + docOpen = [] <$ lit "{{" + + link = + P.label "link (examples: {type List}, {Nat.+})" $ + wrap "syntax.docLink" $ + P.try $ + lit "{" *> (typeLink <|> termLink) <* lit "}" + + expr = + P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + openAs "{{" "syntax.docTransclude" + <+> do + env0 <- S.get + -- we re-allow layout within a transclusion, then restore it to its + -- previous state after + S.put (env0 {inLayout = True}) + -- Note: this P.lookAhead ensures the }} isn't consumed, + -- so it can be consumed below by the `close` which will + -- pop items off the layout stack up to the nearest enclosing + -- syntax.docTransclude. + ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) + S.modify (\env -> env {inLayout = inLayout env0}) + pure ts + <+> close ["syntax.docTransclude"] (lit "}}") + + nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' + nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace + + -- Allows whitespace or a newline, but not more than two newlines in a row. + whitespaceWithoutParagraphBreak :: P () + whitespaceWithoutParagraphBreak = void do + void nonNewlineSpaces + optional newline >>= \case + Just _ -> void nonNewlineSpaces + Nothing -> pure () + + fencedBlock = + P.label "block eval (syntax: a fenced code block)" $ + evalUnison <|> exampleBlock <|> other + where + evalUnison = wrap "syntax.docEval" $ do + -- commit after seeing that ``` is on its own line + fence <- P.try $ do + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) + fence <$ guard b + CP.space + *> local + (\env -> env {inLayout = True, opening = Just "docEval"}) + (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) + + exampleBlock = wrap "syntax.docExampleBlock" $ do + void $ lit "@typecheck" <* CP.space + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + local + (\env -> env {inLayout = True, opening = Just "docExampleBlock"}) + (restoreStack "docExampleBlock" $ lexemes' ([] <$ lit fence)) + + uncolumn column tabWidth s = + let skip col r | col < 1 = r + skip col s@('\t' : _) | col < tabWidth = s + skip col ('\t' : r) = skip (col - tabWidth) r + skip col (c : r) + | isSpace c && (not $ isControl c) = + skip (col - 1) r + skip _ s = s + in List.intercalate "\n" $ skip column <$> lines s + + other = wrap "syntax.docCodeBlock" $ do + column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel + let tabWidth = toInteger . P.unPos $ P.defaultTabWidth + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + name <- + P.takeWhileP Nothing nonNewlineSpace + *> tok (Textual <$> P.takeWhile1P Nothing (not . isSpace)) + <* P.takeWhileP Nothing nonNewlineSpace + _ <- void CP.eol + verbatim <- + tok $ + Textual . uncolumn column tabWidth . 
trimAroundDelimiters + <$> P.someTill P.anySingle ([] <$ lit fence) + pure (name <> verbatim) + + boldOrItalicOrStrikethrough closing = do + let start = + some (P.satisfy (== '*')) + <|> some (P.satisfy (== '_')) + <|> some + (P.satisfy (== '~')) + name s = + if take 1 s == "~" + then "syntax.docStrikethrough" + else if take 1 s == "*" then "syntax.docBold" else "syntax.docItalic" + end <- P.try $ do + end <- start + P.lookAhead (P.satisfy (not . isSpace)) + pure end + wrap (name end) . wrap "syntax.docParagraph" $ + join + <$> P.someTill + (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) + (lit end) + + externalLink = + P.label "hyperlink (example: [link name](https://destination.com))" $ + wrap "syntax.docNamedLink" $ do + _ <- lit "[" + p <- leafies (void $ char ']') + _ <- lit "]" + _ <- lit "(" + target <- + wrap "syntax.docGroup" . wrap "syntax.docJoin" $ + link <|> fmap join (P.some (expr <|> wordy (char ')'))) + _ <- lit ")" + pure (p <> target) + + -- newline = P.optional (lit "\r") *> lit "\n" + + sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + + spaced p = P.some (p <* P.optional sp) + leafies close = wrap "syntax.docParagraph" $ join <$> spaced (leafy close) + + list = bulletedList <|> numberedList + + bulletedList = wrap "syntax.docBulletedList" $ join <$> P.sepBy1 bullet listSep + numberedList = wrap "syntax.docNumberedList" $ join <$> P.sepBy1 numberedItem listSep + + listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (bulletedStart <|> numberedStart) + + bulletedStart = P.try $ do + r <- listItemStart' $ [] <$ P.satisfy bulletChar + P.lookAhead (P.satisfy isSpace) + pure r + where + bulletChar ch = ch == '*' || ch == '-' || ch == '+' + + listItemStart' gutter = P.try $ do + nonNewlineSpaces + col <- column <$> posP + parentCol <- S.gets parentListColumn + guard (col > parentCol) + (col,) <$> gutter + + numberedStart = + listItemStart' $ P.try (tok . fmap num $ LP.decimal <* lit ".") + where + num :: Word -> Lexeme + num n = Numeric (show n) + + listItemParagraph = wrap "syntax.docParagraph" $ do + col <- column <$> posP + join <$> P.some (leaf <* sep col) + where + -- Trickiness here to support hard line breaks inside of + -- a bulleted list, so for instance this parses as expected: + -- + -- * uno dos + -- tres quatro + -- * alice bob + -- carol dave eve + sep col = do + _ <- nonNewlineSpaces + _ <- + P.optional . P.try $ + newline + *> nonNewlineSpaces + *> do + col2 <- column <$> posP + guard $ col2 >= col + (P.notFollowedBy $ numberedStart <|> bulletedStart) + pure () + + numberedItem = P.label msg $ do + (col, s) <- numberedStart + pure s + <+> ( wrap "syntax.docColumn" $ do + p <- nonNewlineSpaces *> listItemParagraph + subList <- + local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) + pure (p <> fromMaybe [] subList) + ) + where + msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" + + bullet = wrap "syntax.docColumn" . P.label "bullet (examples: * item1, - item2)" $ do + (col, _) <- bulletedStart + p <- nonNewlineSpaces *> listItemParagraph + subList <- + local + (\e -> e {parentListColumn = col}) + (P.optional $ listSep *> list) + pure (p <> fromMaybe [] subList) + + newline = P.label "newline" $ lit "\n" <|> lit "\r\n" + + -- ## Section title + -- + -- A paragraph under this section. 
+ -- Part of the same paragraph. Blanklines separate paragraphs. + -- + -- ### A subsection title + -- + -- A paragraph under this subsection. + + -- # A section title (not a subsection) + section :: P [Token Lexeme] + section = wrap "syntax.docSection" $ do + ns <- S.gets parentSections + hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp + title <- paragraph <* CP.space + let m = length hashes + head ns + body <- + local (\env -> env {parentSections = (m : (tail ns))}) $ + P.many (sectionElem <* CP.space) + pure $ title <> join body + + wrap :: String -> P [Token Lexeme] -> P [Token Lexeme] + wrap o p = do + start <- posP + lexemes <- p + pure $ go start lexemes + where + go start [] = [Token (Open o) start start, Token Close start start] + go start ts@(Token _ x _ : _) = + Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) + where + final = last ts + lexemes' :: P [Token Lexeme] -> P [Token Lexeme] lexemes' eof = P.optional space >> do @@ -418,434 +847,6 @@ lexemes' eof = <|> token identifierLexemeP <|> (asum . map token) [semi, textual, hash] - doc2 :: P [Token Lexeme] - doc2 = do - -- Ensure we're at a doc before we start consuming tokens - P.lookAhead (lit "{{") - openStart <- posP - -- Produce any layout tokens, such as closing the last open block or virtual semicolons - -- We don't use 'token' on "{{" directly because we don't want to duplicate layout - -- tokens if we do the rewrite hack for type-docs below. - beforeStartToks <- token' ignore (pure ()) - void $ lit "{{" - openEnd <- posP - CP.space - -- Construct the token for opening the doc block. - let openTok = Token (Open "syntax.docUntitledSection") openStart openEnd - env0 <- S.get - -- Disable layout while parsing the doc block and reset the section number - (bodyToks0, closeTok) <- local - ( \env -> - env - { inLayout = False, - parentSections = 0 : (parentSections env0) - } - ) - do - bodyToks <- body - closeStart <- posP - lit "}}" - closeEnd <- posP - pure (bodyToks, Token Close closeStart closeEnd) - let docToks = beforeStartToks <> [openTok] <> bodyToks0 <> [closeTok] - -- Parse any layout tokens after the doc block, e.g. virtual semicolon - endToks <- token' ignore (pure ()) - -- Hack to allow anonymous doc blocks before type decls - -- {{ Some docs }} Foo.doc = {{ Some docs }} - -- ability Foo where => ability Foo where - tn <- subsequentTypeName - pure $ case (tn) of - -- If we're followed by a type, we rewrite the doc block to be a named doc block. - (Just (WordyId tname)) - | isTopLevel -> - beforeStartToks - <> [WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment)) <$ openTok, Open "=" <$ openTok] - <> [openTok] - <> bodyToks0 - <> [closeTok] - -- We need an extra 'Close' here because we added an extra Open above. - <> [closeTok] - <> endToks - where - isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docToks <> endToks - where - wordyKw kw = separated wordySep (lit kw) - subsequentTypeName = P.lookAhead . P.optional $ do - let lit' s = lit s <* sp - let modifier = typeModifiersAlt (lit' . Text.unpack) - let typeOrAbility' = typeOrAbilityAlt (wordyKw . 
Text.unpack) - _ <- optional modifier *> typeOrAbility' *> sp - Token name start stop <- tokenP identifierP - if Name.isSymboly (HQ'.toName name) - then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) - else pure (WordyId name) - ignore _ _ _ = [] - body = join <$> P.many (sectionElem <* CP.space) - sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap "syntax.docParagraph" $ join <$> spaced leaf - reserved word = List.isPrefixOf "}}" word || all (== '#') word - - wordy closing = wrap "syntax.docWord" . tok . fmap Textual . P.try $ do - let end = - P.lookAhead $ - void docClose - <|> void docOpen - <|> void (P.satisfy isSpace) - <|> void closing - word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end - guard (not $ reserved word || null word) - pure word - - leafy closing = groupy closing gs - where - gs = - link - <|> externalLink - <|> exampleInline - <|> expr - <|> boldOrItalicOrStrikethrough closing - <|> verbatim - <|> atDoc - <|> wordy closing - - leaf = leafy mzero - - atDoc = src <|> evalInline <|> signature <|> signatureInline - where - comma = lit "," <* CP.space - src = - src' "syntax.docSource" "@source" - <|> src' "syntax.docFoldedSource" "@foldedSource" - srcElem = - wrap "syntax.docSourceElement" $ - (typeLink <|> termLink) - <+> ( fmap (fromMaybe []) . P.optional $ - (tok (Reserved <$> lit "@") <+> (CP.space *> annotations)) - ) - where - annotation = tok identifierLexemeP <|> expr <* CP.space - annotations = - join <$> P.some (wrap "syntax.docEmbedAnnotation" annotation) - src' name atName = wrap name $ do - _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- P.sepBy1 srcElem comma - _ <- lit "}" - pure (join s) - signature = wrap "syntax.docSignature" $ do - _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- join <$> P.sepBy1 signatureLink comma - _ <- lit "}" - pure s - signatureInline = wrap "syntax.docSignatureInline" $ do - _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- signatureLink - _ <- lit "}" - pure s - evalInline = wrap "syntax.docEvalInline" $ do - _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space - let inlineEvalClose = [] <$ lit "}" - s <- lexemes' inlineEvalClose - pure s - - typeLink = wrap "syntax.docEmbedTypeLink" do - _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tok identifierLexemeP <* CP.space - - termLink = - wrap "syntax.docEmbedTermLink" $ - tok identifierLexemeP <* CP.space - - signatureLink = - wrap "syntax.docEmbedSignatureLink" $ - tok identifierLexemeP <* CP.space - - groupy closing p = do - Token p start stop <- tokenP p - after <- P.optional . 
P.try $ leafy closing - pure $ case after of - Nothing -> p - Just after -> - [ Token (Open "syntax.docGroup") start stop', - Token (Open "syntax.docJoin") start stop' - ] - <> p - <> after - <> (take 2 $ repeat (Token Close stop' stop')) - where - stop' = maybe stop end (lastMay after) - - verbatim = - P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do - Token originalText start stop <- tokenP do - -- a single backtick followed by a non-backtick is treated as monospaced - let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) - -- also two or more ' followed by that number of closing ' - quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) - P.someTill P.anySingle (lit quotes) - let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - wrap "syntax.docVerbatim" $ - wrap "syntax.docWord" $ - pure [Token (Textual txt) start stop] - else - wrap "syntax.docCode" $ - wrap "syntax.docWord" $ - pure [Token (Textual originalText) start stop] - - exampleInline = - P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - wrap "syntax.docExample" $ do - n <- P.try $ do - _ <- lit "`" - length <$> P.takeWhile1P (Just "backticks") (== '`') - let end :: P [Token Lexeme] = [] <$ lit (replicate (n + 1) '`') - ex <- CP.space *> lexemes' end - pure ex - - docClose = [] <$ lit "}}" - docOpen = [] <$ lit "{{" - - link = - P.label "link (examples: {type List}, {Nat.+})" $ - wrap "syntax.docLink" $ - P.try $ - lit "{" *> (typeLink <|> termLink) <* lit "}" - - expr = - P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - openAs "{{" "syntax.docTransclude" - <+> do - env0 <- S.get - -- we re-allow layout within a transclusion, then restore it to its - -- previous state after - S.put (env0 {inLayout = True}) - -- Note: this P.lookAhead ensures the }} isn't consumed, - -- so it can be consumed below by the `close` which will - -- pop items off the layout stack up to the nearest enclosing - -- syntax.docTransclude. - ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) - S.modify (\env -> env {inLayout = inLayout env0}) - pure ts - <+> close ["syntax.docTransclude"] (lit "}}") - - nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' - nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace - - -- Allows whitespace or a newline, but not more than two newlines in a row. 
- whitespaceWithoutParagraphBreak :: P () - whitespaceWithoutParagraphBreak = void do - void nonNewlineSpaces - optional newline >>= \case - Just _ -> void nonNewlineSpaces - Nothing -> pure () - - fencedBlock = - P.label "block eval (syntax: a fenced code block)" $ - evalUnison <|> exampleBlock <|> other - where - evalUnison = wrap "syntax.docEval" $ do - -- commit after seeing that ``` is on its own line - fence <- P.try $ do - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) - fence <$ guard b - CP.space - *> local - (\env -> env {inLayout = True, opening = Just "docEval"}) - (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) - - exampleBlock = wrap "syntax.docExampleBlock" $ do - void $ lit "@typecheck" <* CP.space - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - local - (\env -> env {inLayout = True, opening = Just "docExampleBlock"}) - (restoreStack "docExampleBlock" $ lexemes' ([] <$ lit fence)) - - uncolumn column tabWidth s = - let skip col r | col < 1 = r - skip col s@('\t' : _) | col < tabWidth = s - skip col ('\t' : r) = skip (col - tabWidth) r - skip col (c : r) - | isSpace c && (not $ isControl c) = - skip (col - 1) r - skip _ s = s - in List.intercalate "\n" $ skip column <$> lines s - - other = wrap "syntax.docCodeBlock" $ do - column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel - let tabWidth = toInteger . P.unPos $ P.defaultTabWidth - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - name <- - P.takeWhileP Nothing nonNewlineSpace - *> tok (Textual <$> P.takeWhile1P Nothing (not . isSpace)) - <* P.takeWhileP Nothing nonNewlineSpace - _ <- void CP.eol - verbatim <- - tok $ - Textual . uncolumn column tabWidth . trimAroundDelimiters - <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name <> verbatim) - - boldOrItalicOrStrikethrough closing = do - let start = - some (P.satisfy (== '*')) - <|> some (P.satisfy (== '_')) - <|> some - (P.satisfy (== '~')) - name s = - if take 1 s == "~" - then "syntax.docStrikethrough" - else if take 1 s == "*" then "syntax.docBold" else "syntax.docItalic" - end <- P.try $ do - end <- start - P.lookAhead (P.satisfy (not . isSpace)) - pure end - wrap (name end) . wrap "syntax.docParagraph" $ - join - <$> P.someTill - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) - - externalLink = - P.label "hyperlink (example: [link name](https://destination.com))" $ - wrap "syntax.docNamedLink" $ do - _ <- lit "[" - p <- leafies (void $ char ']') - _ <- lit "]" - _ <- lit "(" - target <- - wrap "syntax.docGroup" . 
wrap "syntax.docJoin" $ - link <|> fmap join (P.some (expr <|> wordy (char ')'))) - _ <- lit ")" - pure (p <> target) - - -- newline = P.optional (lit "\r") *> lit "\n" - - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 - - spaced p = P.some (p <* P.optional sp) - leafies close = wrap "syntax.docParagraph" $ join <$> spaced (leafy close) - - list = bulletedList <|> numberedList - - bulletedList = wrap "syntax.docBulletedList" $ join <$> P.sepBy1 bullet listSep - numberedList = wrap "syntax.docNumberedList" $ join <$> P.sepBy1 numberedItem listSep - - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (bulletedStart <|> numberedStart) - - bulletedStart = P.try $ do - r <- listItemStart' $ [] <$ P.satisfy bulletChar - P.lookAhead (P.satisfy isSpace) - pure r - where - bulletChar ch = ch == '*' || ch == '-' || ch == '+' - - listItemStart' gutter = P.try $ do - nonNewlineSpaces - col <- column <$> posP - parentCol <- S.gets parentListColumn - guard (col > parentCol) - (col,) <$> gutter - - numberedStart = - listItemStart' $ P.try (tok . fmap num $ LP.decimal <* lit ".") - where - num :: Word -> Lexeme - num n = Numeric (show n) - - listItemParagraph = wrap "syntax.docParagraph" $ do - col <- column <$> posP - join <$> P.some (leaf <* sep col) - where - -- Trickiness here to support hard line breaks inside of - -- a bulleted list, so for instance this parses as expected: - -- - -- * uno dos - -- tres quatro - -- * alice bob - -- carol dave eve - sep col = do - _ <- nonNewlineSpaces - _ <- - P.optional . P.try $ - newline - *> nonNewlineSpaces - *> do - col2 <- column <$> posP - guard $ col2 >= col - (P.notFollowedBy $ numberedStart <|> bulletedStart) - pure () - - numberedItem = P.label msg $ do - (col, s) <- numberedStart - pure s - <+> ( wrap "syntax.docColumn" $ do - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) - ) - where - msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" - - bullet = wrap "syntax.docColumn" . P.label "bullet (examples: * item1, - item2)" $ do - (col, _) <- bulletedStart - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local - (\e -> e {parentListColumn = col}) - (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) - - newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - - -- ## Section title - -- - -- A paragraph under this section. - -- Part of the same paragraph. Blanklines separate paragraphs. - -- - -- ### A subsection title - -- - -- A paragraph under this subsection. 
-
-      -- # A section title (not a subsection)
-      section :: P [Token Lexeme]
-      section = wrap "syntax.docSection" $ do
-        ns <- S.gets parentSections
-        hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp
-        title <- paragraph <* CP.space
-        let m = length hashes + head ns
-        body <-
-          local (\env -> env {parentSections = (m : (tail ns))}) $
-            P.many (sectionElem <* CP.space)
-        pure $ title <> join body
-
-      wrap :: String -> P [Token Lexeme] -> P [Token Lexeme]
-      wrap o p = do
-        start <- posP
-        lexemes <- p
-        pure $ go start lexemes
-        where
-          go start [] = [Token (Open o) start start, Token Close start start]
-          go start ts@(Token _ x _ : _) =
-            Token (Open o) start x : (ts ++ [Token Close (end final) (end final)])
-            where
-              final = last ts
-
 doc :: P [Token Lexeme]
 doc = open <+> (CP.space *> fmap fixup body) <+> (close <* space)
   where

From d1fe6d9429ac5ed69b101fde6bbc353195f2edf5 Mon Sep 17 00:00:00 2001
From: Greg Pfeil
Date: Mon, 1 Jul 2024 13:43:06 -0600
Subject: [PATCH 02/22] Separate the `Doc` lexer from the Unison lexer

`doc2` is a Unison lexer that traverses a `Doc`. `docBody` is the actual
`Doc` lexer that is ignorant of the fact that Unison wraps `Doc` blocks
in `{{`/`}}`.
---
 unison-syntax/src/Unison/Syntax/Lexer.hs | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs
index 144ccd95c3..77c91b8e84 100644
--- a/unison-syntax/src/Unison/Syntax/Lexer.hs
+++ b/unison-syntax/src/Unison/Syntax/Lexer.hs
@@ -425,7 +425,7 @@ doc2 = do
           }
       )
       do
-        bodyToks <- body
+        bodyToks <- docBody
         closeStart <- posP
         lit "}}"
         closeEnd <- posP
@@ -453,6 +453,7 @@ doc2 = do
           isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1
     _ -> docToks <> endToks
   where
+    -- DUPLICATED
     wordyKw kw = separated wordySep (lit kw)
     subsequentTypeName = P.lookAhead . P.optional $ do
      let lit' s = lit s <* sp
@@ -464,7 +465,23 @@ doc2 = do
          then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop)
          else pure (WordyId name)
     ignore _ _ _ = []
-    body = join <$> P.many (sectionElem <* CP.space)
+    -- DUPLICATED
+    sp = P.try $ do
+      spaces <- P.takeWhile1P (Just "space") isSpace
+      close <- P.optional (P.lookAhead (lit "}}"))
+      case close of
+        Nothing -> guard $ ok spaces
+        Just _ -> pure ()
+      pure spaces
+      where
+        ok s = length [() | '\n' <- s] < 2
+
+-- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that
+-- Unison wraps `Doc` literals in `}}`).
+docBody :: P [Token Lexeme]
+docBody = join <$> P.many (sectionElem <* CP.space)
+  where
+    wordyKw kw = separated wordySep (lit kw)
     sectionElem = section <|> fencedBlock <|> list <|> paragraph

From 543daa36c74ab0bf1d4643b37b4baef30de88994 Mon Sep 17 00:00:00 2001
From: Greg Pfeil
Date: Wed, 3 Jul 2024 12:59:09 -0600
Subject: [PATCH 03/22] Move the `Annotated` class to the `Ann` module

This is in preparation for using `Ann` in the `Lexer` module, as that
module actually does some parsing.
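As a rough illustration of the payoff (a sketch, not code in this
patch): with `Annotated` living next to `Ann`, any component can compute
the source span of a parsed structure by folding annotations, e.g. a
hypothetical helper

    spanOf :: (Annotated a) => [a] -> Ann
    spanOf = ann -- the `[a]` instance `foldMap`s the element spans

with the lexer's new `Token` instance (`ann (Token _ s e) = Ann s e`)
supplying the base case.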
---
 unison-syntax/src/Unison/Parser/Ann.hs    | 21 +++++++++++++++++++++
 unison-syntax/src/Unison/Syntax/Lexer.hs  |  5 +++++
 unison-syntax/src/Unison/Syntax/Parser.hs | 16 ++--------------
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/unison-syntax/src/Unison/Parser/Ann.hs b/unison-syntax/src/Unison/Parser/Ann.hs
index feec96279c..961bbcb30c 100644
--- a/unison-syntax/src/Unison/Parser/Ann.hs
+++ b/unison-syntax/src/Unison/Parser/Ann.hs
@@ -4,7 +4,10 @@
 module Unison.Parser.Ann where

+import Data.List.NonEmpty (NonEmpty)
+import Data.Void (absurd)
 import Unison.Lexer.Pos qualified as L
+import Unison.Prelude

 data Ann
   = -- Used for things like Builtins which don't have a source position.
@@ -79,3 +82,21 @@ encompasses (GeneratedFrom ann) other = encompasses ann other
 encompasses ann (GeneratedFrom other) = encompasses ann other
 encompasses (Ann start1 end1) (Ann start2 end2) =
   Just $ start1 <= start2 && end1 >= end2
+
+class Annotated a where
+  ann :: a -> Ann
+
+instance Annotated Ann where
+  ann = id
+
+instance (Annotated a) => Annotated [a] where
+  ann = foldMap ann
+
+instance (Annotated a) => Annotated (NonEmpty a) where
+  ann = foldMap ann
+
+instance (Annotated a) => Annotated (Maybe a) where
+  ann = foldMap ann
+
+instance Annotated Void where
+  ann = absurd
diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs
index 77c91b8e84..fa169e2d06 100644
--- a/unison-syntax/src/Unison/Syntax/Lexer.hs
+++ b/unison-syntax/src/Unison/Syntax/Lexer.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE TemplateHaskell #-}
+{-# OPTIONS_GHC -Wno-orphans #-}

 module Unison.Syntax.Lexer
   ( Token (..),
@@ -51,6 +52,7 @@ import Unison.Name qualified as Name
 import Unison.NameSegment (NameSegment)
 import Unison.NameSegment qualified as NameSegment (docSegment)
 import Unison.NameSegment.Internal qualified as NameSegment
+import Unison.Parser.Ann (Ann (Ann), Annotated (..))
 import Unison.Prelude
 import Unison.ShortHash (ShortHash)
 import Unison.ShortHash qualified as SH
@@ -64,6 +66,9 @@ import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP)
 import Unison.Util.Bytes qualified as Bytes
 import Unison.Util.Monoid (intercalateMap)

+instance Annotated (Token a) where
+  ann (Token _ s e) = Ann s e
+
 type BlockName = String

 type Layout = [(BlockName, Column)]
diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs
index affab5bf2c..4945f4347e 100644
--- a/unison-syntax/src/Unison/Syntax/Parser.hs
+++ b/unison-syntax/src/Unison/Syntax/Parser.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE UndecidableInstances #-}
+{-# OPTIONS_GHC -Wno-orphans #-}

 module Unison.Syntax.Parser
   ( Annotated (..),
@@ -77,7 +78,7 @@ import Unison.Hashable qualified as Hashable
 import Unison.Name as Name
 import Unison.Names (Names)
 import Unison.Names.ResolutionResult qualified as Names
-import Unison.Parser.Ann (Ann (..))
+import Unison.Parser.Ann (Ann (..), Annotated (..))
 import Unison.Pattern (Pattern)
 import Unison.Pattern qualified as Pattern
 import Unison.Prelude
@@ -177,25 +178,12 @@ newtype Input = Input {inputStream :: [L.Token L.Lexeme]}
   deriving stock (Eq, Ord, Show)
   deriving newtype (P.Stream, P.VisualStream)

-class Annotated a where
-  ann :: a -> Ann
-
-instance Annotated Ann where
-  ann = id
-
-instance Annotated (L.Token a) where
-  ann (L.Token _ s e) = Ann s e
-
 instance (Annotated a) => Annotated (ABT.Term f v a) where
   ann = ann . ABT.annotation

 instance (Annotated a) => Annotated (Pattern a) where
   ann = ann . Pattern.loc

-instance (Annotated a) => Annotated [a] where
-  ann [] = mempty
-  ann (h : t) = foldl' (\acc a -> acc <> ann a) (ann h) t
-
 instance (Annotated a, Annotated b) => Annotated (MatchCase a b) where
   ann (MatchCase p _ b) = ann p <> ann b

From 5f87b4152739662e20120810ee0525228f9a3363 Mon Sep 17 00:00:00 2001
From: Greg Pfeil
Date: Mon, 1 Jul 2024 21:11:53 -0600
Subject: [PATCH 04/22] Un-hiding the `Doc` parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`doc2` was a parser in lexer’s clothing. It would parse recursively, but
then return the result as a flat list of tokens.

This separates the parsing from the “unparsing” (which returns the
tokens), so now we have a parser that produces a recursive `Doc`
structure. The parser currently applies the unparser immediately, so it
should produce the same stream of tokens as the previous version.

Eventually, we should be able to avoid unparsing the `Doc` structure.
---
 unison-syntax/package.yaml               |   1 +
 unison-syntax/src/Unison/Syntax/Lexer.hs | 451 ++++++++++++++++-------
 unison-syntax/unison-syntax.cabal        |   2 +
 3 files changed, 330 insertions(+), 124 deletions(-)

diff --git a/unison-syntax/package.yaml b/unison-syntax/package.yaml
index 8e1a478baf..ccb1a057d7 100644
--- a/unison-syntax/package.yaml
+++ b/unison-syntax/package.yaml
@@ -10,6 +10,7 @@ dependencies:
   - containers
   - cryptonite
   - extra
+  - free
   - lens
   - megaparsec
   - mtl
diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs
index fa169e2d06..fd27118050 100644
--- a/unison-syntax/src/Unison/Syntax/Lexer.hs
+++ b/unison-syntax/src/Unison/Syntax/Lexer.hs
@@ -8,6 +8,16 @@ module Unison.Syntax.Lexer
     Err (..),
     Pos (..),
     Lexeme (..),
+    DocTree,
+    DocUntitledSection (..),
+    DocTop (..),
+    DocColumn (..),
+    DocLeaf (..),
+    DocEmbedLink (..),
+    DocSourceElement (..),
+    DocEmbedSignatureLink (..),
+    DocJoin (..),
+    DocEmbedAnnotation (..),
     lexer,
     escapeChars,
     debugFileLex,
@@ -28,16 +38,19 @@ module Unison.Syntax.Lexer
   )
 where

+import Control.Comonad.Cofree (Cofree ((:<)))
 import Control.Monad.State qualified as S
 import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower)
 import Data.Foldable qualified as Foldable
 import Data.List qualified as List
 import Data.List.Extra qualified as List
+import Data.List.NonEmpty (NonEmpty ((:|)))
 import Data.List.NonEmpty qualified as Nel
 import Data.List.NonEmpty qualified as NonEmpty
 import Data.Map.Strict qualified as Map
 import Data.Set qualified as Set
 import Data.Text qualified as Text
+import Data.Void (vacuous)
 import GHC.Exts (sortWith)
 import Text.Megaparsec qualified as P
 import Text.Megaparsec.Char (char)
@@ -52,7 +65,7 @@ import Unison.Name qualified as Name
 import Unison.NameSegment (NameSegment)
 import Unison.NameSegment qualified as NameSegment (docSegment)
 import Unison.NameSegment.Internal qualified as NameSegment
-import Unison.Parser.Ann (Ann (Ann), Annotated (..))
+import Unison.Parser.Ann (Ann (Ann, GeneratedFrom), Annotated (..))
 import Unison.Prelude
 import Unison.ShortHash (ShortHash)
 import Unison.ShortHash qualified as SH
@@ -66,6 +79,9 @@ import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP)
 import Unison.Util.Bytes qualified as Bytes
 import Unison.Util.Monoid (intercalateMap)

+instance (Annotated a) => Annotated (Cofree f a) where
+  ann (a :< _) = ann a
+
 instance Annotated (Token a) where
   ann (Token _ s e) = Ann s e

@@ -418,11 +434,9 @@ doc2 = do
   void $ lit "{{"
   openEnd <- posP
   CP.space
-  -- Construct the token for opening the 
doc block. - let openTok = Token (Open "syntax.docUntitledSection") openStart openEnd env0 <- S.get -- Disable layout while parsing the doc block and reset the section number - (bodyToks0, closeTok) <- local + (docToks, closeTok) <- local ( \env -> env { inLayout = False, @@ -430,33 +444,30 @@ doc2 = do } ) do - bodyToks <- docBody + bodyToks <- docBody (lit "}}") closeStart <- posP lit "}}" closeEnd <- posP - pure (bodyToks, Token Close closeStart closeEnd) - let docToks = beforeStartToks <> [openTok] <> bodyToks0 <> [closeTok] + pure (docToLexemes (openStart, closeEnd) bodyToks, Token Close closeStart closeEnd) -- Parse any layout tokens after the doc block, e.g. virtual semicolon endToks <- token' ignore (pure ()) -- Hack to allow anonymous doc blocks before type decls -- {{ Some docs }} Foo.doc = {{ Some docs }} -- ability Foo where => ability Foo where tn <- subsequentTypeName - pure $ case (tn) of - -- If we're followed by a type, we rewrite the doc block to be a named doc block. - (Just (WordyId tname)) - | isTopLevel -> - beforeStartToks - <> [WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment)) <$ openTok, Open "=" <$ openTok] - <> [openTok] - <> bodyToks0 - <> [closeTok] - -- We need an extra 'Close' here because we added an extra Open above. - <> [closeTok] - <> endToks - where - isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docToks <> endToks + pure $ + beforeStartToks <> case (tn) of + -- If we're followed by a type, we rewrite the doc block to be a named doc block. + Just (WordyId tname) + | isTopLevel -> + Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd + : Token (Open "=") openStart openEnd + : docToks + -- We need an extra 'Close' here because we added an extra Open above. + <> (closeTok : endToks) + where + isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 + _ -> docToks <> endToks where -- DUPLICATED wordyKw kw = separated wordySep (lit kw) @@ -481,17 +492,221 @@ doc2 = do where ok s = length [() | '\n' <- s] < 2 +-- | Like `P.some`, but returns an actual `NonEmpty`. +some' :: P a -> P (NonEmpty a) +some' p = liftA2 (:|) p $ many p + +-- | Like `P.someTill`, but returns an actual `NonEmpty`. +someTill' :: P a -> P end -> P (NonEmpty a) +someTill' p end = liftA2 (:|) p $ P.manyTill p end + +-- | Like `P.sepBy1`, but returns an actual `NonEmpty`. +sepBy1' :: P a -> P sep -> P (NonEmpty a) +sepBy1' p sep = liftA2 (:|) p . many $ sep *> p + +newtype DocUntitledSection a = DocUntitledSection [a] + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +-- | Haskell parallel to @unison/base.Doc@. +-- +-- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The +-- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls, +-- have it rendered to a scratch file, and then we can’t parse it. Changing the types here to match Unison wouldn’t +-- fix the issue. We have to modify the types and parser in concert (in both Haskell and Unison) to bring them in +-- line. +-- +-- __NB__: Uses of @[`Token` `Lexeme`]@ here indicate a nested transition to the Unison lexer. 
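+--
+-- For intuition only (a schematic sketch; `Token` wrappers and `Ann` annotations are elided), a doc literal like
+--
+-- > {{ # Title
+-- >    Some words. }}
+--
+-- parses to something shaped like
+--
+-- > DocUntitledSection
+-- >   [DocSection (DocParagraph [DocWord "Title"])
+-- >      [DocParagraph [DocWord "Some", DocWord "words."]]]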
+data DocTop a + = -- | The first argument is always a Paragraph + DocSection a [a] + | DocEval [Token Lexeme] + | DocExampleBlock [Token Lexeme] + | DocCodeBlock (Token String) (Token String) + | DocBulletedList (NonEmpty (DocColumn a)) + | DocNumberedList (NonEmpty (Token Word64, DocColumn a)) + | DocParagraph (NonEmpty (DocLeaf a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data DocColumn a + = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List + DocColumn a (Maybe a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data DocLeaf a + = DocLink DocEmbedLink + | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of + -- Transcludes & Words) + DocNamedLink a (DocLeaf Void) + | DocExample [Token Lexeme] + | DocTransclude [Token Lexeme] + | -- | Always a Paragraph + DocBold a + | -- | Always a Paragraph + DocItalic a + | -- | Always a Paragraph + DocStrikethrough a + | -- | Always a Word + DocVerbatim (DocLeaf Void) + | -- | Always a Word + DocCode (DocLeaf Void) + | DocSource (NonEmpty DocSourceElement) + | DocFoldedSource (NonEmpty DocSourceElement) + | DocEvalInline [Token Lexeme] + | DocSignature (NonEmpty DocEmbedSignatureLink) + | DocSignatureInline DocEmbedSignatureLink + | DocWord (Token String) + | DocGroup (DocJoin a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data DocEmbedLink + = DocEmbedTypeLink (Token (HQ'.HashQualified Name)) + | DocEmbedTermLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +data DocSourceElement = DocSourceElement DocEmbedLink [DocEmbedAnnotation] + deriving (Eq, Ord, Show) + +newtype DocEmbedSignatureLink = DocEmbedSignatureLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +newtype DocJoin a = DocJoin (NonEmpty (DocLeaf a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +newtype DocEmbedAnnotation + = -- | Always a DocTransclude + DocEmbedAnnotation (Either (Token (HQ'.HashQualified Name)) (DocLeaf Void)) + deriving (Eq, Ord, Show) + +type DocTree = Cofree DocTop Ann + +instance (Annotated a) => Annotated (DocTop a) where + ann = \case + DocSection title body -> ann title <> ann body + DocEval code -> ann code + DocExampleBlock code -> ann code + DocCodeBlock label body -> ann label <> ann body + DocBulletedList items -> ann items + DocNumberedList items -> ann $ snd <$> items + DocParagraph leaves -> ann leaves + +instance (Annotated a) => Annotated (DocColumn a) where + ann (DocColumn para list) = ann para <> ann list + +instance (Annotated a) => Annotated (DocLeaf a) where + ann = \case + DocLink link -> ann link + DocNamedLink label target -> ann label <> ann target + DocExample code -> ann code + DocTransclude code -> ann code + DocBold para -> ann para + DocItalic para -> ann para + DocStrikethrough para -> ann para + DocVerbatim word -> ann word + DocCode word -> ann word + DocSource elems -> ann elems + DocFoldedSource elems -> ann elems + DocEvalInline code -> ann code + DocSignature links -> ann links + DocSignatureInline link -> ann link + DocWord text -> ann text + DocGroup (DocJoin leaves) -> ann leaves + +instance Annotated DocEmbedLink where + ann = \case + DocEmbedTypeLink name -> ann name + DocEmbedTermLink name -> ann name + +instance Annotated DocSourceElement where + ann (DocSourceElement link target) = ann link <> ann target + +instance Annotated DocEmbedSignatureLink where + ann (DocEmbedSignatureLink name) = ann name + +instance Annotated 
DocEmbedAnnotation where + ann (DocEmbedAnnotation a) = either ann ann a + +-- | This is a short-term hack to turn our parse tree back into the sequence of lexemes the current parser expects. +-- +-- The medium-term solution is to preserve @[`DocTree`]@ as its own lexeme type, and hand it to the parser without +-- flattening it back to tokens. Longer-term, maybe we add a real lexer for @Doc@, and then whatever is left of this +-- parser moves into the actual parser. +docToLexemes :: (Pos, Pos) -> DocUntitledSection DocTree -> [Token Lexeme] +docToLexemes (startDoc, endDoc) (DocUntitledSection tops) = + Token (Open "syntax.docUntitledSection") startDoc startDoc + : concatMap cata tops <> pure (Token Close endDoc endDoc) + where + wrap :: Ann -> String -> [Token Lexeme] -> [Token Lexeme] + wrap ann suffix lexemes = go (extractStart ann) lexemes + where + extractStart = \case + Ann start _ -> start + GeneratedFrom a -> extractStart a + a -> error $ "expected a good Pos! Got: " <> show a + o = "syntax.doc" <> suffix + go start [] = [Token (Open o) start start, Token Close start start] + go start ts@(Token _ x _ : _) = + Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) + where + final = last ts + cata :: DocTree -> [Token Lexeme] + cata (a :< top) = docTop a $ cata <$> top + docTop start = \case + DocSection title body -> wrap start "Section" $ title <> join body + DocEval code -> wrap start "Eval" code + DocExampleBlock code -> wrap start "ExampleBlock" code + DocCodeBlock label text -> wrap start "CodeBlock" [Textual <$> label, Textual <$> text] + DocBulletedList items -> wrap start "BulletedList" . concat $ (\col -> docColumn (ann col) col) <$> items + DocNumberedList items -> + wrap start "NumberedList" . concat $ + uncurry (:) . bimap (Numeric . show <$>) (\col -> docColumn (ann col) col) + <$> items + DocParagraph body -> wrap start "Paragraph" . concat $ (\l -> docLeaf (ann l) l) <$> body + docColumn start (DocColumn para mlist) = wrap start "Column" $ foldr (flip (<>)) para mlist + docLeaf start = \case + DocLink link -> wrap start "Link" $ docEmbedLink (ann link) link + DocNamedLink name target -> wrap start "NamedLink" $ name <> docLeaf (ann target) (vacuous target) + DocExample code -> wrap start "Example" code + DocTransclude code -> wrap start "Transclude" code + DocBold para -> wrap start "Bold" para + DocItalic para -> wrap start "Italic" para + DocStrikethrough para -> wrap start "Strikethrough" para + DocVerbatim word -> wrap start "Verbatim" . docLeaf (ann word) $ vacuous word + DocCode word -> wrap start "Code" . docLeaf (ann word) $ vacuous word + DocSource elems -> wrap start "Source" . concat $ (\e -> docSourceElement (ann e) e) <$> elems + DocFoldedSource elems -> wrap start "FoldedSource" . concat $ (\e -> docSourceElement (ann e) e) <$> elems + DocEvalInline code -> wrap start "EvalInline" code + DocSignature links -> wrap start "Signature" . concat $ (\l -> docEmbedSignatureLink (ann l) l) <$> links + DocSignatureInline link -> wrap start "SignatureInline" $ docEmbedSignatureLink (ann link) link + DocWord text -> wrap start "Word" . pure $ Textual <$> text + DocGroup (DocJoin leaves) -> + wrap start "Group" . wrap start "Join" . concat $ (\l -> docLeaf (ann l) l) <$> leaves + docEmbedLink start = \case + DocEmbedTypeLink ident -> wrap start "EmbedTypeLink" . pure $ identifierLexeme <$> ident + DocEmbedTermLink ident -> wrap start "EmbedTermLink" . 
pure $ identifierLexeme <$> ident + docSourceElement start (DocSourceElement link anns) = + wrap start "SourceElement" $ + docEmbedLink (ann link) link + <> maybe + [] + ((Token (Reserved "@") (Pos 0 0) (Pos 0 0) :) . concatMap (\a -> docEmbedAnnotation (ann a) a)) + (NonEmpty.nonEmpty anns) + docEmbedSignatureLink start (DocEmbedSignatureLink ident) = + wrap start "EmbedSignatureLink" . pure $ identifierLexeme <$> ident + docEmbedAnnotation start (DocEmbedAnnotation a) = + wrap start "EmbedAnnotation" $ either (pure . fmap identifierLexeme) (\l -> docLeaf (ann l) $ vacuous l) a + -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -docBody :: P [Token Lexeme] -docBody = join <$> P.many (sectionElem <* CP.space) +docBody :: P end -> P (DocUntitledSection DocTree) +docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where wordyKw kw = separated wordySep (lit kw) sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap "syntax.docParagraph" $ join <$> spaced leaf + paragraph = wrap' . DocParagraph <$> spaced leaf reserved word = List.isPrefixOf "}}" word || all (== '#') word - wordy closing = wrap "syntax.docWord" . tok . fmap Textual . P.try $ do + wordy :: P end -> P (DocLeaf void) + wordy closing = fmap DocWord . tokenP . P.try $ do let end = P.lookAhead $ void docClose @@ -520,65 +735,61 @@ docBody = join <$> P.many (sectionElem <* CP.space) where comma = lit "," <* CP.space src = - src' "syntax.docSource" "@source" - <|> src' "syntax.docFoldedSource" "@foldedSource" + src' DocSource "@source" + <|> src' DocFoldedSource "@foldedSource" srcElem = - wrap "syntax.docSourceElement" $ - (typeLink <|> termLink) - <+> ( fmap (fromMaybe []) . P.optional $ - (tok (Reserved <$> lit "@") <+> (CP.space *> annotations)) - ) + DocSourceElement + <$> (typeLink <|> termLink) + <*> ( fmap (fromMaybe []) . P.optional $ + (lit "@") *> (CP.space *> annotations) + ) where - annotation = tok identifierLexemeP <|> expr <* CP.space + annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space annotations = - join <$> P.some (wrap "syntax.docEmbedAnnotation" annotation) - src' name atName = wrap name $ do + P.some (DocEmbedAnnotation <$> annotation) + src' name atName = fmap name $ do _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- P.sepBy1 srcElem comma + s <- sepBy1' srcElem comma _ <- lit "}" - pure (join s) - signature = wrap "syntax.docSignature" $ do + pure s + signature = fmap DocSignature $ do _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- join <$> P.sepBy1 signatureLink comma + s <- sepBy1' signatureLink comma _ <- lit "}" pure s - signatureInline = wrap "syntax.docSignatureInline" $ do + signatureInline = fmap DocSignatureInline $ do _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space s <- signatureLink _ <- lit "}" pure s - evalInline = wrap "syntax.docEvalInline" $ do + evalInline = fmap DocEvalInline $ do _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space let inlineEvalClose = [] <$ lit "}" s <- lexemes' inlineEvalClose pure s - typeLink = wrap "syntax.docEmbedTypeLink" do + typeLink = fmap DocEmbedTypeLink $ do _ <- typeOrAbilityAlt (wordyKw . 
Text.unpack) <* CP.space - tok identifierLexemeP <* CP.space + tokenP identifierP <* CP.space termLink = - wrap "syntax.docEmbedTermLink" $ - tok identifierLexemeP <* CP.space + fmap DocEmbedTermLink $ + tokenP identifierP <* CP.space signatureLink = - wrap "syntax.docEmbedSignatureLink" $ - tok identifierLexemeP <* CP.space + fmap DocEmbedSignatureLink $ + tokenP identifierP <* CP.space groupy closing p = do - Token p start stop <- tokenP p + Token p _ _ <- tokenP p after <- P.optional . P.try $ leafy closing pure $ case after of Nothing -> p Just after -> - [ Token (Open "syntax.docGroup") start stop', - Token (Open "syntax.docJoin") start stop' - ] - <> p - <> after - <> (take 2 $ repeat (Token Close stop' stop')) - where - stop' = maybe stop end (lastMay after) + DocGroup + . DocJoin + $ p + :| pure after verbatim = P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do @@ -595,17 +806,17 @@ docBody = join <$> P.many (sectionElem <* CP.space) let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed -- If it's a multi-line verbatim block we trim any whitespace representing -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - wrap "syntax.docVerbatim" $ - wrap "syntax.docWord" $ - pure [Token (Textual txt) start stop] + pure . DocVerbatim $ + DocWord $ + Token txt start stop else - wrap "syntax.docCode" $ - wrap "syntax.docWord" $ - pure [Token (Textual originalText) start stop] + pure . DocCode $ + DocWord $ + Token originalText start stop exampleInline = P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - wrap "syntax.docExample" $ do + fmap DocExample $ do n <- P.try $ do _ <- lit "`" length <$> P.takeWhile1P (Just "backticks") (== '`') @@ -613,19 +824,19 @@ docBody = join <$> P.many (sectionElem <* CP.space) ex <- CP.space *> lexemes' end pure ex - docClose = [] <$ lit "}}" + docClose = [] <$ docClose' docOpen = [] <$ lit "{{" link = P.label "link (examples: {type List}, {Nat.+})" $ - wrap "syntax.docLink" $ + fmap DocLink $ P.try $ lit "{" *> (typeLink <|> termLink) <* lit "}" expr = - P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + fmap DocTransclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ openAs "{{" "syntax.docTransclude" - <+> do + *> do env0 <- S.get -- we re-allow layout within a transclusion, then restore it to its -- previous state after @@ -637,7 +848,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) S.modify (\env -> env {inLayout = inLayout env0}) pure ts - <+> close ["syntax.docTransclude"] (lit "}}") + <* close ["syntax.docTransclude"] (lit "}}") nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace @@ -654,7 +865,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) P.label "block eval (syntax: a fenced code block)" $ evalUnison <|> exampleBlock <|> other where - evalUnison = wrap "syntax.docEval" $ do + evalUnison = fmap (wrap' . DocEval) $ do -- commit after seeing that ``` is on its own line fence <- P.try $ do fence <- lit "```" <+> P.takeWhileP Nothing (== '`') @@ -665,7 +876,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) (\env -> env {inLayout = True, opening = Just "docEval"}) (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) - exampleBlock = wrap "syntax.docExampleBlock" $ do + exampleBlock = fmap (wrap' . 
DocExampleBlock) $ do void $ lit "@typecheck" <* CP.space fence <- lit "```" <+> P.takeWhileP Nothing (== '`') local @@ -682,20 +893,20 @@ docBody = join <$> P.many (sectionElem <* CP.space) skip _ s = s in List.intercalate "\n" $ skip column <$> lines s - other = wrap "syntax.docCodeBlock" $ do + other = fmap (uncurry $ wrapSimple2 DocCodeBlock) $ do column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel let tabWidth = toInteger . P.unPos $ P.defaultTabWidth fence <- lit "```" <+> P.takeWhileP Nothing (== '`') name <- P.takeWhileP Nothing nonNewlineSpace - *> tok (Textual <$> P.takeWhile1P Nothing (not . isSpace)) + *> tokenP (P.takeWhile1P Nothing (not . isSpace)) <* P.takeWhileP Nothing nonNewlineSpace _ <- void CP.eol verbatim <- - tok $ - Textual . uncolumn column tabWidth . trimAroundDelimiters + tokenP $ + uncolumn column tabWidth . trimAroundDelimiters <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name <> verbatim) + pure (name, verbatim) boldOrItalicOrStrikethrough closing = do let start = @@ -705,30 +916,29 @@ docBody = join <$> P.many (sectionElem <* CP.space) (P.satisfy (== '~')) name s = if take 1 s == "~" - then "syntax.docStrikethrough" - else if take 1 s == "*" then "syntax.docBold" else "syntax.docItalic" + then DocStrikethrough + else if take 1 s == "*" then DocBold else DocItalic end <- P.try $ do end <- start P.lookAhead (P.satisfy (not . isSpace)) pure end - wrap (name end) . wrap "syntax.docParagraph" $ - join - <$> P.someTill - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) + name end . wrap' . DocParagraph + <$> someTill' + (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) + (lit end) externalLink = P.label "hyperlink (example: [link name](https://destination.com))" $ - wrap "syntax.docNamedLink" $ do + fmap (uncurry DocNamedLink) $ do _ <- lit "[" p <- leafies (void $ char ']') _ <- lit "]" _ <- lit "(" target <- - wrap "syntax.docGroup" . wrap "syntax.docJoin" $ - link <|> fmap join (P.some (expr <|> wordy (char ')'))) + fmap (DocGroup . DocJoin) $ + fmap pure link <|> some' (expr <|> wordy (char ')')) _ <- lit ")" - pure (p <> target) + pure (p, target) -- newline = P.optional (lit "\r") *> lit "\n" @@ -742,15 +952,15 @@ docBody = join <$> P.many (sectionElem <* CP.space) where ok s = length [() | '\n' <- s] < 2 - spaced p = P.some (p <* P.optional sp) - leafies close = wrap "syntax.docParagraph" $ join <$> spaced (leafy close) + spaced p = some' (p <* P.optional sp) + leafies close = wrap' . DocParagraph <$> spaced (leafy close) list = bulletedList <|> numberedList - bulletedList = wrap "syntax.docBulletedList" $ join <$> P.sepBy1 bullet listSep - numberedList = wrap "syntax.docNumberedList" $ join <$> P.sepBy1 numberedItem listSep + bulletedList = wrap' . DocBulletedList <$> sepBy1' bullet listSep + numberedList = wrap' . 
DocNumberedList <$> sepBy1' numberedItem listSep - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (bulletedStart <|> numberedStart) + listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) bulletedStart = P.try $ do r <- listItemStart' $ [] <$ P.satisfy bulletChar @@ -759,6 +969,7 @@ docBody = join <$> P.many (sectionElem <* CP.space) where bulletChar ch = ch == '*' || ch == '-' || ch == '+' + listItemStart' :: P a -> P (Int, a) listItemStart' gutter = P.try $ do nonNewlineSpaces col <- column <$> posP @@ -767,14 +978,11 @@ docBody = join <$> P.many (sectionElem <* CP.space) (col,) <$> gutter numberedStart = - listItemStart' $ P.try (tok . fmap num $ LP.decimal <* lit ".") - where - num :: Word -> Lexeme - num n = Numeric (show n) + listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - listItemParagraph = wrap "syntax.docParagraph" $ do + listItemParagraph = fmap (wrap' . DocParagraph) $ do col <- column <$> posP - join <$> P.some (leaf <* sep col) + some' (leaf <* sep col) where -- Trickiness here to support hard line breaks inside of -- a bulleted list, so for instance this parses as expected: @@ -792,29 +1000,29 @@ docBody = join <$> P.many (sectionElem <* CP.space) *> do col2 <- column <$> posP guard $ col2 >= col - (P.notFollowedBy $ numberedStart <|> bulletedStart) + (P.notFollowedBy $ void numberedStart <|> void bulletedStart) pure () numberedItem = P.label msg $ do (col, s) <- numberedStart - pure s - <+> ( wrap "syntax.docColumn" $ do + (s,) + <$> ( fmap (uncurry DocColumn) $ do p <- nonNewlineSpaces *> listItemParagraph subList <- local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) + pure (p, subList) ) where msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" - bullet = wrap "syntax.docColumn" . P.label "bullet (examples: * item1, - item2)" $ do + bullet = fmap (uncurry DocColumn) . P.label "bullet (examples: * item1, - item2)" $ do (col, _) <- bulletedStart p <- nonNewlineSpaces *> listItemParagraph subList <- local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p <> fromMaybe [] subList) + pure (p, subList) newline = P.label "newline" $ lit "\n" <|> lit "\r\n" @@ -828,8 +1036,8 @@ docBody = join <$> P.many (sectionElem <* CP.space) -- A paragraph under this subsection. -- # A section title (not a subsection) - section :: P [Token Lexeme] - section = wrap "syntax.docSection" $ do + section :: P DocTree + section = fmap (wrap' . 
uncurry DocSection) $ do ns <- S.gets parentSections hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp title <- paragraph <* CP.space @@ -837,19 +1045,13 @@ docBody = join <$> P.many (sectionElem <* CP.space) let m = length hashes + head ns body <- local (\env -> env {parentSections = (m : (tail ns))}) $ P.many (sectionElem <* CP.space) - pure $ title <> join body + pure $ (title, body) - wrap :: String -> P [Token Lexeme] -> P [Token Lexeme] - wrap o p = do - start <- posP - lexemes <- p - pure $ go start lexemes - where - go start [] = [Token (Open o) start start, Token Close start start] - go start ts@(Token _ x _ : _) = - Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) - where - final = last ts + wrap' :: DocTop DocTree -> DocTree + wrap' doc = ann doc :< doc + + wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> DocTop DocTree) -> a -> b -> DocTree + wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] lexemes' eof = @@ -1289,12 +1491,13 @@ identifierP = do -- .foo.++.doc -- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") identifierLexemeP :: P Lexeme -identifierLexemeP = do - name <- identifierP - pure - if Name.isSymboly (HQ'.toName name) - then SymbolyId name - else WordyId name +identifierLexemeP = identifierLexeme <$> identifierP + +identifierLexeme :: HQ'.HashQualified Name -> Lexeme +identifierLexeme name = + if Name.isSymboly (HQ'.toName name) + then SymbolyId name + else WordyId name wordyIdSegP :: P NameSegment wordyIdSegP = diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 888982134f..4b097e6021 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -69,6 +69,7 @@ library , containers , cryptonite , extra + , free , lens , megaparsec , mtl @@ -127,6 +128,7 @@ test-suite syntax-tests , cryptonite , easytest , extra + , free , lens , megaparsec , mtl From 227ff27cea21af3a636bb94d0ccadc80fcd37d9f Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 5 Jul 2024 14:15:05 -0600 Subject: [PATCH 05/22] =?UTF-8?q?Don=E2=80=99t=20=E2=80=9Cun-parse?= =?UTF-8?q?=E2=80=9D=20`Doc`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes the layer that makes the `Doc` parser look like a lexer and replaces it with a function that converts the Doc structure directly into Unison terms. 
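To make the shape of that conversion concrete, here is a toy sketch (an illustration only, not the real `DocTop`, which has many more constructors): the annotated `Doc` tree is a `Cofree`, and the conversion is an ordinary catamorphism over it, hence the dependency on the `free` package added just above.

```haskell
{-# LANGUAGE DeriveFunctor #-}

import Control.Comonad.Cofree (Cofree ((:<)))

-- A made-up analogue of `DocTop`: one layer of Doc structure per constructor.
data DocF a = Word String | Paragraph [a]
  deriving (Functor)

-- Fold the annotated tree bottom-up, ignoring the annotations; this is the
-- same `cata` that `doc2Block` defines locally in the diff below.
cata :: (Functor f) => (f a -> a) -> Cofree f x -> a
cata fn (_ :< fx) = fn (cata fn <$> fx)

-- Rendering to a String stands in for building `syntax.doc*` term applications.
render :: Cofree DocF x -> String
render = cata go
  where
    go (Word w) = w
    go (Paragraph ps) = unwords ps
```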
--- .../src/Unison/Syntax/TermParser.hs | 301 +++++++++--------- unison-syntax/src/Unison/Syntax/Lexer.hs | 103 ++---- unison-syntax/src/Unison/Syntax/Parser.hs | 9 +- 3 files changed, 178 insertions(+), 235 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 635a974d89..1f1dda24c1 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -12,6 +12,7 @@ module Unison.Syntax.TermParser ) where +import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.Reader (asks, local) import Data.Char qualified as Char import Data.Foldable (foldrM) @@ -24,6 +25,7 @@ import Data.Sequence qualified as Sequence import Data.Set qualified as Set import Data.Text qualified as Text import Data.Tuple.Extra qualified as TupleE +import Data.Void (vacuous) import Text.Megaparsec qualified as P import U.Core.ABT qualified as ABT import Unison.ABT qualified as ABT @@ -38,7 +40,7 @@ import Unison.NameSegment qualified as NameSegment import Unison.Names (Names) import Unison.Names qualified as Names import Unison.NamesWithHistory qualified as Names -import Unison.Parser.Ann (Ann) +import Unison.Parser.Ann (Ann (Ann)) import Unison.Parser.Ann qualified as Ann import Unison.Pattern (Pattern) import Unison.Pattern qualified as Pattern @@ -113,8 +115,10 @@ rewriteBlock = do pure (DD.rewriteType (ann kw <> ann rhs) (L.payload <$> vs) lhs rhs) typeLink' :: (Monad m, Var v) => P v m (L.Token Reference) -typeLink' = do - id <- hqPrefixId +typeLink' = findUniqueType =<< hqPrefixId + +findUniqueType :: (Monad m, Var v) => L.Token (HQ.HashQualified Name) -> P v m (L.Token Reference) +findUniqueType id = do ns <- asks names case Names.lookupHQType Names.IncludeSuffixes (L.payload id) ns of s @@ -434,7 +438,7 @@ resolveHashQualified tok = do names <- asks names case L.payload tok of HQ.NameOnly n -> pure $ Term.var (ann tok) (Name.toVar n) - _ -> case Names.lookupHQTerm Names.IncludeSuffixes (L.payload tok) names of + hqn -> case Names.lookupHQTerm Names.IncludeSuffixes hqn names of s | Set.null s -> failCommitted $ UnknownTerm tok s | Set.size s > 1 -> failCommitted $ UnknownTerm tok s @@ -461,160 +465,155 @@ termLeaf = doc2Block <&> \(spanAnn, trm) -> trm {ABT.annotation = ABT.annotation trm <> spanAnn} ] --- Syntax for documentation v2 blocks, which are surrounded by {{ }}. +-- | Gives a parser an explicit stream to parse, so that it consumes nothing from the original stream when it runs. +-- +-- This is used inside the `Doc` -> `Term` conversion, where we have chunks of Unison code embedded that need to be +-- parsed. It’s a consequence of parsing Doc in the midst of the Unison lexer. +subParse :: (Ord v, Monad m) => P v m a -> [L.Token L.Lexeme] -> P v m a +subParse p toks = do + orig <- P.getInput + P.setInput $ Input toks + result <- p <* P.eof + P.setInput orig + pure result + +-- | Syntax for documentation v2 blocks, which are surrounded by @{{@ @}}@. -- The lexer does most of the heavy lifting so there's not a lot for -- the parser to do. For instance, in -- --- {{ --- Hi there! --- --- goodbye. --- }} +-- > {{ +-- > Hi there! +-- > +-- > goodbye. 
+-- > }} -- -- the lexer will produce: -- --- [Open "syntax.docUntitledSection", --- Open "syntax.docParagraph", --- Open "syntax.docWord", Textual "Hi", Close, --- Open "syntax.docWord", Textual "there!", Close, --- Close --- Open "syntax.docParagraph", --- Open "syntax.docWord", Textual "goodbye", Close, --- Close --- Close] +-- > [ Doc +-- > ( DocUntitledSection +-- > (DocParagraph (DocWord "Hi" :| [DocWord "there!"])) +-- > (DocParagraph (DocWord "goodbye" :| [])) +-- > ) +-- > ] -- -- The parser will parse this into the Unison expression: -- --- syntax.docUntitledSection [ --- syntax.docParagraph [syntax.docWord "Hi", syntax.docWord "there!"], --- syntax.docParagraph [syntax.docWord "goodbye"] --- ] +-- > syntax.docUntitledSection [ +-- > syntax.docParagraph [syntax.docWord "Hi", syntax.docWord "there!"], +-- > syntax.docParagraph [syntax.docWord "goodbye"] +-- > ] -- --- Where `syntax.doc{Paragraph, UntitledSection,...}` are all ordinary term +-- Where @syntax.doc{Paragraph, UntitledSection,...}@ are all ordinary term -- variables that will be looked up in the environment like anything else. This -- means that the documentation syntax can have its meaning changed by --- overriding what functions the names `syntax.doc*` correspond to. +-- overriding what functions the names @syntax.doc*@ correspond to. doc2Block :: forall m v. (Monad m, Var v) => P v m (Ann {- Annotation for the whole spanning block -}, Term v Ann) doc2Block = do - P.lookAhead (openBlockWith "syntax.docUntitledSection") *> elem + L.Token docContents startDoc endDoc <- doc + let docAnn = Ann startDoc endDoc + (docAnn,) . docUntitledSection (gann docAnn) <$> traverse (cata $ docTop <=< sequenceA) docContents where - -- For terms which aren't blocks the spanning annotation is the same as the - -- term annotation. - selfAnnotated :: Term v Ann -> (Ann, Term v Ann) - selfAnnotated t = (ann t, t) - elem :: P v m (Ann {- Annotation for the whole spanning block -}, Term v Ann) - elem = - (selfAnnotated <$> text) <|> do - startTok <- openBlock - let -- here, `t` will be something like `Open "syntax.docWord"` - -- so `f` will be a term var with the name "syntax.docWord". - f = f' startTok - f' t = Term.var (ann t) (Var.nameds (L.payload t)) - - -- follows are some common syntactic forms used for parsing child elements - - -- regular is parsed into `f child1 child2 child3` for however many children - regular = do - cs <- P.many (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f cs - pure (ann startTok <> ann endTok, trm) - - -- variadic is parsed into: `f [child1, child2, ...]` - variadic = variadic' f - variadic' f = do - cs <- P.many (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f [Term.list (ann cs) cs] - pure (ann startTok <> ann endTok, trm) - - -- sectionLike is parsed into: `f tm [child1, child2, ...]` - sectionLike = do - arg1 <- (snd <$> elem) - cs <- P.many (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f [arg1, Term.list (ann cs) cs] - pure (ann startTok <> ann endTok, trm) - - evalLike wrap = do - tm <- term - endTok <- closeBlock - let trm = Term.apps' f [wrap tm] - pure (ann startTok <> ann endTok, trm) - - -- converts `tm` to `'tm` - -- - -- Embedded examples like ``1 + 1`` are represented as terms, - -- but are wrapped in delays so they are left unevaluated for the - -- code which renders documents. 
(We want the doc display to get - -- the unevaluated expression `1 + 1` and not `2`) - addDelay tm = Term.delay (ann tm) tm - case L.payload startTok of - "syntax.docJoin" -> variadic - "syntax.docUntitledSection" -> variadic - "syntax.docColumn" -> variadic - "syntax.docParagraph" -> variadic - "syntax.docSignature" -> variadic - "syntax.docSource" -> variadic - "syntax.docFoldedSource" -> variadic - "syntax.docBulletedList" -> variadic - "syntax.docSourceAnnotations" -> variadic - "syntax.docSourceElement" -> do - link <- (snd <$> elem) - anns <- P.optional $ reserved "@" *> (snd <$> elem) - endTok <- closeBlock - let trm = Term.apps' f [link, fromMaybe (Term.list (ann link) mempty) anns] - pure (ann startTok <> ann endTok, trm) - "syntax.docNumberedList" -> do - nitems@((n, _) : _) <- P.some nitem - endTok <- closeBlock - let items = snd <$> nitems - let trm = Term.apps' f [n, Term.list (ann items) items] - pure (ann startTok <> ann endTok, trm) - where - nitem = do - n <- number - t <- openBlockWith "syntax.docColumn" - let f = f' ("syntax.docColumn" <$ t) - (_spanAnn, child) <- variadic' f - pure (n, child) - "syntax.docSection" -> sectionLike - -- @source{ type Blah, foo, type Bar } - "syntax.docEmbedTermLink" -> do - tm <- addDelay <$> (hashQualifiedPrefixTerm <|> hashQualifiedInfixTerm) - endTok <- closeBlock - let trm = Term.apps' f [tm] - pure (ann startTok <> ann endTok, trm) - "syntax.docEmbedSignatureLink" -> do - tm <- addDelay <$> (hashQualifiedPrefixTerm <|> hashQualifiedInfixTerm) - endTok <- closeBlock - let trm = Term.apps' f [tm] - pure (ann startTok <> ann endTok, trm) - "syntax.docEmbedTypeLink" -> do - r <- typeLink' - endTok <- closeBlock - let trm = Term.apps' f [Term.typeLink (ann r) (L.payload r)] - pure (ann startTok <> ann endTok, trm) - "syntax.docExample" -> do - trm <- term - endTok <- closeBlock - let spanAnn = ann startTok <> ann endTok - pure . (spanAnn,) $ case trm of - tm@(Term.Apps' _ xs) -> - let fvs = List.Extra.nubOrd $ concatMap (toList . Term.freeVars) xs - n = Term.nat (ann tm) (fromIntegral (length fvs)) - lam = addDelay $ Term.lam' (ann tm) ((Ann.GeneratedFrom spanAnn,) <$> fvs) tm - in Term.apps' f [n, lam] - tm -> Term.apps' f [Term.nat (ann tm) 0, addDelay tm] - "syntax.docTransclude" -> evalLike id - "syntax.docEvalInline" -> evalLike addDelay - "syntax.docExampleBlock" -> do - (spanAnn, tm) <- block'' False True "syntax.docExampleBlock" (pure (void startTok)) closeBlock - pure $ (spanAnn, Term.apps' f [Term.nat (ann tm) 0, addDelay tm]) - "syntax.docEval" -> do - (spanAnn, tm) <- block' False "syntax.docEval" (pure (void startTok)) closeBlock - pure $ (spanAnn, Term.apps' f [addDelay tm]) - _ -> regular + cata :: (Functor f) => (f a -> a) -> Cofree f x -> a + cata fn (_ :< fx) = fn $ cata fn <$> fx + + gann :: (Annotated a) => a -> Ann + gann = Ann.GeneratedFrom . ann + + addDelay :: Term v Ann -> Term v Ann + addDelay tm = Term.delay (ann tm) tm + + f :: (Annotated a) => a -> String -> Term v Ann + f a = Term.var (gann a) . Var.nameds . ("syntax.doc" <>) + + docUntitledSection :: Ann -> L.DocUntitledSection (Term v Ann) -> Term v Ann + docUntitledSection ann (L.DocUntitledSection tops) = + Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops + + docTop :: L.DocTop (Term v Ann) -> TermP v m + docTop d = case d of + L.DocSection title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] + L.DocEval code -> + Term.app (gann d) (f d "Eval") . addDelay . 
snd + <$> subParse (block' False False "syntax.docEval" (pure $ pure ()) $ Ann.External <$ P.eof) code + L.DocExampleBlock code -> + Term.apps' (f d "ExampleBlock") . (Term.nat (gann d) 0 :) . pure . addDelay . snd + <$> subParse (block' False True "syntax.docExampleBlock" (pure $ pure ()) $ Ann.External <$ P.eof) code + L.DocCodeBlock label body -> + pure $ + Term.apps' + (f d "CodeBlock") + [Term.text (ann label) . Text.pack $ L.payload label, Term.text (ann body) . Text.pack $ L.payload body] + L.DocBulletedList items -> + pure $ Term.app (gann d) (f d "BulletedList") . Term.list (gann items) . toList $ docColumn <$> items + L.DocNumberedList items@((n, _) :| _) -> + pure $ + Term.apps' + (f d "NumberedList") + [Term.nat (ann d) $ L.payload n, Term.list (gann $ snd <$> items) . toList $ docColumn . snd <$> items] + L.DocParagraph leaves -> + Term.app (gann d) (f d "Paragraph") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves + + docColumn :: L.DocColumn (Term v Ann) -> Term v Ann + docColumn d@(L.DocColumn para sublist) = + Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist + + docLeaf :: L.DocLeaf (Term v Ann) -> TermP v m + docLeaf d = case d of + L.DocLink link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link + L.DocNamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) + L.DocExample code -> do + trm <- subParse term code + pure . Term.apps' (f d "Example") $ case trm of + tm@(Term.Apps' _ xs) -> + let fvs = List.Extra.nubOrd $ concatMap (toList . Term.freeVars) xs + n = Term.nat (ann tm) (fromIntegral (length fvs)) + lam = addDelay $ Term.lam' (ann tm) ((mempty,) <$> fvs) tm + in [n, lam] + tm -> [Term.nat (ann tm) 0, addDelay tm] + L.DocTransclude code -> Term.app (gann d) (f d "Transclude") <$> subParse term code + L.DocBold para -> pure $ Term.app (gann d) (f d "Bold") para + L.DocItalic para -> pure $ Term.app (gann d) (f d "Italic") para + L.DocStrikethrough para -> pure $ Term.app (gann d) (f d "Strikethrough") para + L.DocVerbatim leaf -> Term.app (gann d) (f d "Verbatim") <$> docLeaf (vacuous leaf) + L.DocCode leaf -> Term.app (gann d) (f d "Code") <$> docLeaf (vacuous leaf) + L.DocSource elems -> + Term.app (gann d) (f d "Source") . Term.list (ann elems) . toList <$> traverse docSourceElement elems + L.DocFoldedSource elems -> + Term.app (gann d) (f d "FoldedSource") . Term.list (ann elems) . toList <$> traverse docSourceElement elems + L.DocEvalInline code -> Term.app (gann d) (f d "EvalInline") . addDelay <$> subParse term code + L.DocSignature links -> + Term.app (gann d) (f d "Signature") . Term.list (ann links) . toList <$> traverse docEmbedSignatureLink links + L.DocSignatureInline link -> Term.app (gann d) (f d "SignatureInline") <$> docEmbedSignatureLink link + L.DocWord txt -> pure . Term.app (gann d) (f d "Word") . Term.text (ann txt) . Text.pack $ L.payload txt + L.DocGroup (L.DocJoin leaves) -> + Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList + <$> traverse docLeaf leaves + + docEmbedLink :: L.DocEmbedLink -> TermP v m + docEmbedLink d = case d of + L.DocEmbedTypeLink ident -> + Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload + <$> findUniqueType (HQ'.toHQ <$> ident) + L.DocEmbedTermLink ident -> + Term.app (gann d) (f d "EmbedTermLink") . 
addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + + docSourceElement :: L.DocSourceElement -> TermP v m + docSourceElement d@(L.DocSourceElement link anns) = do + link' <- docEmbedLink link + anns' <- traverse docEmbedAnnotation anns + pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] + + docEmbedSignatureLink :: L.DocEmbedSignatureLink -> TermP v m + docEmbedSignatureLink d@(L.DocEmbedSignatureLink ident) = + Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + + docEmbedAnnotation :: L.DocEmbedAnnotation -> TermP v m + docEmbedAnnotation d@(L.DocEmbedAnnotation a) = + -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a + -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes + -- produced for an identifier and the lexemes consumed for text line up. So, I think this is a bugfix I can’t + -- avoid. + Term.app (gann d) (f d "EmbedAnnotation") <$> either (resolveHashQualified . fmap HQ'.toHQ) (docLeaf . vacuous) a docBlock :: (Monad m, Var v) => TermP v m docBlock = do @@ -1143,7 +1142,7 @@ customFailure :: (P.MonadParsec e s m) => e -> m a customFailure = P.customFailure block :: forall m v. (Monad m, Var v) => String -> P v m (Ann, Term v Ann) -block s = block' False s (openBlockWith s) closeBlock +block s = block' False False s (openBlockWith s) closeBlock -- example: use Foo.bar.Baz + ++ x -- + ++ and x are called the "suffixes" of the `use` statement, and @@ -1213,24 +1212,16 @@ substImports ns imports = ] block' :: - (Monad m, Var v) => - IsTop -> - String -> - P v m (L.Token ()) -> - P v m (L.Token ()) -> - P v m (Ann {- ann which spans the whole block -}, Term v Ann) -block' isTop = block'' isTop False - -block'' :: forall m v end. 
(Monad m, Var v, Annotated end) => IsTop -> - Bool -> -- `True` means insert `()` at end of block if it ends with a statement + -- | `True` means insert `()` at end of block if it ends with a statement + Bool -> String -> P v m (L.Token ()) -> P v m end -> P v m (Ann {- ann which spans the whole block -}, Term v Ann) -block'' isTop implicitUnitAtEnd s openBlock closeBlock = do +block' isTop implicitUnitAtEnd s openBlock closeBlock = do open <- openBlock (names, imports) <- imports _ <- optional semi diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index fd27118050..3c5041cf6f 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -42,6 +42,7 @@ import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.State qualified as S import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) import Data.Foldable qualified as Foldable +import Data.Functor.Classes import Data.List qualified as List import Data.List.Extra qualified as List import Data.List.NonEmpty (NonEmpty ((:|))) @@ -50,7 +51,6 @@ import Data.List.NonEmpty qualified as NonEmpty import Data.Map.Strict qualified as Map import Data.Set qualified as Set import Data.Text qualified as Text -import Data.Void (vacuous) import GHC.Exts (sortWith) import Text.Megaparsec qualified as P import Text.Megaparsec.Char (char) @@ -65,7 +65,7 @@ import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment -import Unison.Parser.Ann (Ann (Ann, GeneratedFrom), Annotated (..)) +import Unison.Parser.Ann (Ann (Ann), Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as SH @@ -158,6 +158,7 @@ data Lexeme | Bytes Bytes.Bytes -- bytes literals | Hash ShortHash -- hash literals | Err Err + | Doc (DocUntitledSection DocTree) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? @@ -389,6 +390,7 @@ displayLexeme = \case Bytes _b -> "bytes literal" Hash h -> Text.unpack (SH.toText h) Err e -> show e + Doc _ -> "doc structure" infixl 2 <+> @@ -436,7 +438,7 @@ doc2 = do CP.space env0 <- S.get -- Disable layout while parsing the doc block and reset the section number - (docToks, closeTok) <- local + (docTok, closeTok) <- local ( \env -> env { inLayout = False, @@ -444,16 +446,18 @@ doc2 = do } ) do - bodyToks <- docBody (lit "}}") + body <- docBody (lit "}}") closeStart <- posP lit "}}" closeEnd <- posP - pure (docToLexemes (openStart, closeEnd) bodyToks, Token Close closeStart closeEnd) + pure (Token (Doc body) openStart closeEnd, Token Close closeStart closeEnd) -- Parse any layout tokens after the doc block, e.g. virtual semicolon endToks <- token' ignore (pure ()) -- Hack to allow anonymous doc blocks before type decls -- {{ Some docs }} Foo.doc = {{ Some docs }} -- ability Foo where => ability Foo where + -- + -- __FIXME__: This should be done _after_ parsing, not in lexing. tn <- subsequentTypeName pure $ beforeStartToks <> case (tn) of @@ -462,12 +466,13 @@ doc2 = do | isTopLevel -> Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd : Token (Open "=") openStart openEnd - : docToks - -- We need an extra 'Close' here because we added an extra Open above. 
- <> (closeTok : endToks) + : docTok + -- We need an extra 'Close' here because we added an extra Open above. + : closeTok + : endToks where isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docToks <> endToks + _ -> docTok : endToks where -- DUPLICATED wordyKw kw = separated wordySep (lit kw) @@ -527,6 +532,15 @@ data DocTop a | DocParagraph (NonEmpty (DocLeaf a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) +instance Eq1 DocTop where + liftEq _ _ _ = True + +instance Ord1 DocTop where + liftCompare _ _ _ = LT + +instance Show1 DocTop where + liftShowsPrec _ _ _ _ x = x + data DocColumn a = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List DocColumn a (Maybe a) @@ -625,76 +639,6 @@ instance Annotated DocEmbedSignatureLink where instance Annotated DocEmbedAnnotation where ann (DocEmbedAnnotation a) = either ann ann a --- | This is a short-term hack to turn our parse tree back into the sequence of lexemes the current parser expects. --- --- The medium-term solution is to preserve @[`DocTree`]@ as its own lexeme type, and hand it to the parser without --- flattening it back to tokens. Longer-term, maybe we add a real lexer for @Doc@, and then whatever is left of this --- parser moves into the actual parser. -docToLexemes :: (Pos, Pos) -> DocUntitledSection DocTree -> [Token Lexeme] -docToLexemes (startDoc, endDoc) (DocUntitledSection tops) = - Token (Open "syntax.docUntitledSection") startDoc startDoc - : concatMap cata tops <> pure (Token Close endDoc endDoc) - where - wrap :: Ann -> String -> [Token Lexeme] -> [Token Lexeme] - wrap ann suffix lexemes = go (extractStart ann) lexemes - where - extractStart = \case - Ann start _ -> start - GeneratedFrom a -> extractStart a - a -> error $ "expected a good Pos! Got: " <> show a - o = "syntax.doc" <> suffix - go start [] = [Token (Open o) start start, Token Close start start] - go start ts@(Token _ x _ : _) = - Token (Open o) start x : (ts ++ [Token Close (end final) (end final)]) - where - final = last ts - cata :: DocTree -> [Token Lexeme] - cata (a :< top) = docTop a $ cata <$> top - docTop start = \case - DocSection title body -> wrap start "Section" $ title <> join body - DocEval code -> wrap start "Eval" code - DocExampleBlock code -> wrap start "ExampleBlock" code - DocCodeBlock label text -> wrap start "CodeBlock" [Textual <$> label, Textual <$> text] - DocBulletedList items -> wrap start "BulletedList" . concat $ (\col -> docColumn (ann col) col) <$> items - DocNumberedList items -> - wrap start "NumberedList" . concat $ - uncurry (:) . bimap (Numeric . show <$>) (\col -> docColumn (ann col) col) - <$> items - DocParagraph body -> wrap start "Paragraph" . concat $ (\l -> docLeaf (ann l) l) <$> body - docColumn start (DocColumn para mlist) = wrap start "Column" $ foldr (flip (<>)) para mlist - docLeaf start = \case - DocLink link -> wrap start "Link" $ docEmbedLink (ann link) link - DocNamedLink name target -> wrap start "NamedLink" $ name <> docLeaf (ann target) (vacuous target) - DocExample code -> wrap start "Example" code - DocTransclude code -> wrap start "Transclude" code - DocBold para -> wrap start "Bold" para - DocItalic para -> wrap start "Italic" para - DocStrikethrough para -> wrap start "Strikethrough" para - DocVerbatim word -> wrap start "Verbatim" . docLeaf (ann word) $ vacuous word - DocCode word -> wrap start "Code" . docLeaf (ann word) $ vacuous word - DocSource elems -> wrap start "Source" . 
concat $ (\e -> docSourceElement (ann e) e) <$> elems - DocFoldedSource elems -> wrap start "FoldedSource" . concat $ (\e -> docSourceElement (ann e) e) <$> elems - DocEvalInline code -> wrap start "EvalInline" code - DocSignature links -> wrap start "Signature" . concat $ (\l -> docEmbedSignatureLink (ann l) l) <$> links - DocSignatureInline link -> wrap start "SignatureInline" $ docEmbedSignatureLink (ann link) link - DocWord text -> wrap start "Word" . pure $ Textual <$> text - DocGroup (DocJoin leaves) -> - wrap start "Group" . wrap start "Join" . concat $ (\l -> docLeaf (ann l) l) <$> leaves - docEmbedLink start = \case - DocEmbedTypeLink ident -> wrap start "EmbedTypeLink" . pure $ identifierLexeme <$> ident - DocEmbedTermLink ident -> wrap start "EmbedTermLink" . pure $ identifierLexeme <$> ident - docSourceElement start (DocSourceElement link anns) = - wrap start "SourceElement" $ - docEmbedLink (ann link) link - <> maybe - [] - ((Token (Reserved "@") (Pos 0 0) (Pos 0 0) :) . concatMap (\a -> docEmbedAnnotation (ann a) a)) - (NonEmpty.nonEmpty anns) - docEmbedSignatureLink start (DocEmbedSignatureLink ident) = - wrap start "EmbedSignatureLink" . pure $ identifierLexeme <$> ident - docEmbedAnnotation start (DocEmbedAnnotation a) = - wrap start "EmbedAnnotation" $ either (pure . fmap identifierLexeme) (\l -> docLeaf (ann l) $ vacuous l) a - -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). docBody :: P end -> P (DocUntitledSection DocTree) @@ -1741,6 +1685,7 @@ instance P.VisualStream [Token Lexeme] where pretty Close = "" pretty (Semi True) = "" pretty (Semi False) = ";" + pretty (Doc d) = show d pad (Pos line1 col1) (Pos line2 col2) = if line1 == line2 then replicate (col2 - col1) ' ' diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 4945f4347e..733ecc93cf 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -5,7 +5,8 @@ module Unison.Syntax.Parser ( Annotated (..), Err, Error (..), - Input, + -- FIXME: Don’t export the data constructor + Input (..), P, ParsingEnv (..), UniqueName, @@ -16,6 +17,7 @@ module Unison.Syntax.Parser chainr1, character, closeBlock, + doc, failCommitted, failureIf, hqInfixId, @@ -393,6 +395,11 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing +doc :: (Ord v) => P v m (L.Token (L.DocUntitledSection L.DocTree)) +doc = queryToken \case + L.Doc d -> pure d + _ -> Nothing + -- | Parses a tuple of 'a's, or a single parenthesized 'a' -- -- returns the result of combining elements with 'pair', alongside the annotation containing From 159ea3a433ad18a30af64984673fede127866def Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 22 Jul 2024 15:03:00 -0600 Subject: [PATCH 06/22] Extract `preParse` from `lexer` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After running the core of the lexer, the `lexer` function then does some work to turn the stream into a tree, and reorder some lexemes. It then throws away the tree structure. This is the first step of preserving the tree structure for the parser. It extracts the “pre-parser” from `lexer` so that it can eventually be used _after_ the lexer, rather than internally. 
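For intuition, the reordering mentioned above is essentially a stable sort of stanzas by a small priority key: `use` statements first, then type/ability declarations, with everything else kept in source order. A toy sketch of just that idea (the `Stanza` type here is made up for illustration):

```haskell
import Data.List (sortOn)

-- Made-up stand-ins for the real token stanzas.
data Stanza = Use String | TypeDecl String | Term String

-- The same priorities the real `reorder` assigns: 0 for `use`,
-- 1 for type/ability declarations, 3 for everything else.
priority :: Stanza -> Int
priority (Use _) = 0
priority (TypeDecl _) = 1
priority (Term _) = 3

-- `sortOn` is stable, like the `sortWith` used by `reorder`, so stanzas
-- with equal priority keep their relative source order.
reorderStanzas :: [Stanza] -> [Stanza]
reorderStanzas = sortOn priority
```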
This also moves `fixup` to be applied on each block as we reorder it, rather than across the entire stream at the end (since the goal is to not _have_ an entire stream any more). --- unison-syntax/src/Unison/Syntax/Lexer.hs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 3c5041cf6f..fd48d6abb3 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1564,7 +1564,7 @@ stanzas = go [] -- Moves type and ability declarations to the front of the token stream -- and move `use` statements to the front of each block reorder :: [T (Token Lexeme)] -> [T (Token Lexeme)] -reorder = join . sortWith f . stanzas +reorder = foldr fixup [] . join . sortWith f . stanzas where f [] = 3 :: Int f (t0 : _) = case payload $ headToken t0 of @@ -1572,16 +1572,17 @@ reorder = join . sortWith f . stanzas Open typOrA | Set.member (Text.pack typOrA) typeOrAbility -> 1 Reserved "use" -> 0 _ -> 3 :: Int + -- after reordering can end up with trailing semicolon at the end of + -- a block, which we remove with this pass + fixup (payload . headToken -> Semi _) [] = [] + fixup tok tail = tok : tail + +-- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. +preParse :: [Token Lexeme] -> T (Token Lexeme) +preParse = reorderTree reorder . tree lexer :: String -> String -> [Token Lexeme] -lexer scope rem = - let t = tree $ lexer0' scope rem - -- after reordering can end up with trailing semicolon at the end of - -- a block, which we remove with this pass - fixup ((payload -> Semi _) : t@(payload -> Close) : tl) = t : fixup tl - fixup [] = [] - fixup (h : t) = h : fixup t - in fixup . toList $ reorderTree reorder t +lexer scope = toList . preParse . lexer0' scope isDelayOrForce :: Char -> Bool isDelayOrForce op = op == '\'' || op == '!' From 32472bd9e01cb87ac310f375a199e44800c05b9a Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 19:46:17 -0600 Subject: [PATCH 07/22] Allow EOF to close layout blocks This removes the need to pad the lexer stream with trailing `Close` lexemes. If EOF is reached, the parser will automatically close any layout blocks (but not context-free blocks). 
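The two halves of this fit together as follows (both pieces are excerpted, slightly abridged, from the diff below; nothing here is new API): the lexer's end-of-input parser synthesizes one `Close` token per open layout frame, and the term parser accepts either an explicit `Close` or end of input wherever a layout block may end.

```haskell
-- Lexer side: at EOF, emit a virtual Close for every layout frame still
-- open (plus one if a block is mid-open), at the EOF position.
eof :: P [Token Lexeme]
eof = P.try do
  p <- P.eof >> posP
  n <- maybe 0 (const 1) <$> S.gets opening
  l <- S.gets layout
  pure $ replicate (length l + n) (Token Close p p)

-- Parser side: a layout block ends with an explicit Close or at EOF.
optionalCloseBlock :: (Ord v) => P v m (L.Token ())
optionalCloseBlock = closeBlock <|> (\() -> L.Token () mempty mempty) <$> P.eof
```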
--- .../src/Unison/Syntax/TermParser.hs | 25 +++--- .../reparses-with-same-hash.u | 2 +- .../transcripts/error-messages.output.md | 2 + unison-syntax/src/Unison/Syntax/Lexer.hs | 87 ++++++++++--------- unison-syntax/src/Unison/Syntax/Parser.hs | 6 ++ 5 files changed, 71 insertions(+), 51 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 1f1dda24c1..999d5658ba 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -103,7 +103,7 @@ rewriteBlock = do rewriteTermlike kw mk = do kw <- quasikeyword kw lhs <- term - (_spanAnn, rhs) <- block "==>" + (_spanAnn, rhs) <- layoutBlock "==>" pure (mk (ann kw <> ann rhs) lhs rhs) rewriteTerm = rewriteTermlike "term" DD.rewriteTerm rewriteCase = rewriteTermlike "case" DD.rewriteCase @@ -164,13 +164,13 @@ match :: (Monad m, Var v) => TermP v m match = do start <- openBlockWith "match" scrutinee <- term - _ <- closeBlock + _ <- optionalCloseBlock _ <- P.try (openBlockWith "with") <|> do t <- anyToken P.customFailure (ExpectedBlockOpen "with" t) (_arities, cases) <- NonEmpty.unzip <$> matchCases1 start - _ <- closeBlock + _ <- optionalCloseBlock pure $ Term.match (ann start <> ann (NonEmpty.last cases)) @@ -212,10 +212,10 @@ matchCase = do [ Nothing <$ P.try (quasikeyword "otherwise"), Just <$> infixAppOrBooleanOp ] - (_spanAnn, t) <- block "->" + (_spanAnn, t) <- layoutBlock "->" pure (guard, t) let unguardedBlock = label "case match" do - (_spanAnn, t) <- block "->" + (_spanAnn, t) <- layoutBlock "->" pure (Nothing, t) -- a pattern's RHS is either one or more guards, or a single unguarded block. guardsAndBlocks <- guardedBlocks <|> (pure @[] <$> unguardedBlock) @@ -357,10 +357,10 @@ lam p = label "lambda" $ mkLam <$> P.try (some prefixDefinitionName <* reserved in Term.lam' (ann (head vs) <> ann b) annotatedArgs b letBlock, handle, ifthen :: (Monad m, Var v) => TermP v m -letBlock = label "let" $ (snd <$> block "let") +letBlock = label "let" $ (snd <$> layoutBlock "let") handle = label "handle" do (handleSpan, b) <- block "handle" - (_withSpan, handler) <- block "with" + (_withSpan, handler) <- layoutBlock "with" -- We don't use the annotation span from 'with' here because it will -- include a dedent if it's at the end of block. -- Meaning the newline gets overwritten when pretty-printing and it messes things up. 
@@ -377,7 +377,7 @@ lamCase = do start <- openBlockWith "cases" cases <- matchCases1 start (arity, cases) <- checkCasesArities cases - _ <- closeBlock + _ <- optionalCloseBlock lamvars <- replicateM arity (Parser.uniqueName 10) let vars = Var.named <$> [tweak v i | (v, i) <- lamvars `zip` [(1 :: Int) ..]] @@ -396,7 +396,7 @@ ifthen = label "if" do start <- peekAny (_spanAnn, c) <- block "if" (_spanAnn, t) <- block "then" - (_spanAnn, f) <- block "else" + (_spanAnn, f) <- layoutBlock "else" pure $ Term.iff (ann start <> ann f) c t f text :: (Var v) => TermP v m @@ -987,7 +987,7 @@ delayQuote = P.label "quote" do delayBlock :: (Monad m, Var v) => P v m (Ann {- Ann spanning the whole block -}, Term v Ann) delayBlock = P.label "do" do - (spanAnn, b) <- block "do" + (spanAnn, b) <- layoutBlock "do" let argSpan = (ann b {- would be nice to use the annotation for 'do' here, but it's not terribly important -}) pure $ (spanAnn, DD.delayTerm (ann b) argSpan b) @@ -1074,7 +1074,7 @@ destructuringBind = do let boundVars' = snd <$> boundVars _ <- P.lookAhead (openBlockWith "=") pure (p, boundVars') - (_spanAnn, scrute) <- block "=" -- Dwight K. Scrute ("The People's Scrutinee") + (_spanAnn, scrute) <- layoutBlock "=" -- Dwight K. Scrute ("The People's Scrutinee") let guard = Nothing let absChain vs t = foldr (\v t -> ABT.abs' (ann t) v t) t vs thecase t = Term.MatchCase p (fmap (absChain boundVars) guard) $ absChain boundVars t @@ -1144,6 +1144,9 @@ customFailure = P.customFailure block :: forall m v. (Monad m, Var v) => String -> P v m (Ann, Term v Ann) block s = block' False False s (openBlockWith s) closeBlock +layoutBlock :: forall m v. (Monad m, Var v) => String -> P v m (Ann, Term v Ann) +layoutBlock s = block' False False s (openBlockWith s) optionalCloseBlock + -- example: use Foo.bar.Baz + ++ x -- + ++ and x are called the "suffixes" of the `use` statement, and -- `Foo.bar.Baz` is called the prefix. A `use` statement has the effect diff --git a/unison-src/transcripts-round-trip/reparses-with-same-hash.u b/unison-src/transcripts-round-trip/reparses-with-same-hash.u index 98fbe28a57..5d75eff442 100644 --- a/unison-src/transcripts-round-trip/reparses-with-same-hash.u +++ b/unison-src/transcripts-round-trip/reparses-with-same-hash.u @@ -542,7 +542,7 @@ fix_4384d = {{ {{ docExampleBlock 0 '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17, fix_4384e = id : x -> x id x = x - {{ {{ docExampleBlock 0 (id id id id id id id id id id id id id id id id id id id id id (x -> 0) }} }} + {{ {{ docExampleBlock 0 (id id id id id id id id id id id id id id id id id id id id id (x -> 0)) }} }} fnApplicationSyntax = Environment.default = do 1 + 1 diff --git a/unison-src/transcripts/error-messages.output.md b/unison-src/transcripts/error-messages.output.md index 0b3e334aa6..03e7e652ac 100644 --- a/unison-src/transcripts/error-messages.output.md +++ b/unison-src/transcripts/error-messages.output.md @@ -290,6 +290,7 @@ x = match Some a with I was surprised to find a -> here. I was expecting one of these instead: + * end of input * newline or semicolon ``` @@ -312,6 +313,7 @@ x = match Some a with I was surprised to find a '|' here. 
I was expecting one of these instead: + * end of input * newline or semicolon ``` diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index fd48d6abb3..2ca3dc3738 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -288,9 +288,9 @@ showErrorFancy = \case GT -> "greater than " P.ErrorCustom a -> P.showErrorComponent a -lexer0' :: String -> String -> [Token Lexeme] -lexer0' scope rem = - case flip S.evalState env0 $ P.runParserT lexemes scope rem of +lexer :: String -> String -> [Token Lexeme] +lexer scope rem = + case flip S.evalState env0 $ P.runParserT (lexemes eof) scope rem of Left e -> let errsWithSourcePos = fst $ @@ -326,8 +326,14 @@ lexer0' scope rem = endPos = startPos & \(Pos l c) -> Pos l (c + errorLength) in [Token (Err err) startPos endPos] in errsWithSourcePos >>= errorToTokens - Right ts -> Token (Open scope) topLeftCorner topLeftCorner : tweak ts + Right ts -> postLex $ Token (Open scope) topLeftCorner topLeftCorner : ts where + eof :: P [Token Lexeme] + eof = P.try do + p <- P.eof >> posP + n <- maybe 0 (const 1) <$> S.gets opening + l <- S.gets layout + pure $ replicate (length l + n) (Token Close p p) errorItemToString :: EP.ErrorItem Char -> String errorItemToString = \case (P.Tokens ts) -> Foldable.toList ts @@ -336,28 +342,31 @@ lexer0' scope rem = customErrs es = [Err <$> e | P.ErrorCustom e <- toList es] toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col) env0 = ParsingEnv [] (Just scope) True [0] 0 - -- hacky postprocessing pass to do some cleanup of stuff that's annoying to - -- fix without adding more state to the lexer: - -- - 1+1 lexes as [1, +1], convert this to [1, +, 1] - -- - when a semi followed by a virtual semi, drop the virtual, lets you - -- write - -- foo x = action1; - -- 2 - -- - semi immediately after first Open is ignored - tweak [] = [] - tweak (h@(payload -> Semi False) : (payload -> Semi True) : t) = h : tweak t - tweak (h@(payload -> Reserved _) : t) = h : tweak t - tweak (t1 : t2@(payload -> Numeric num) : rem) - | notLayout t1 && touches t1 t2 && isSigned num = - t1 - : Token - (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num))))) - (start t2) - (inc $ start t2) - : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2) - : tweak rem - tweak (h : t) = h : tweak t + +-- | hacky postprocessing pass to do some cleanup of stuff that's annoying to +-- fix without adding more state to the lexer: +-- - 1+1 lexes as [1, +1], convert this to [1, +, 1] +-- - when a semi is followed by a virtual semi, drop the virtual; this lets you +-- write +-- foo x = action1; +-- 2 +-- - semi immediately after first Open is ignored +tweak :: (Token Lexeme) -> [Token Lexeme] -> [Token Lexeme] +tweak h@(Token (Semi False) _ _) (Token (Semi True) _ _ : t) = h : t +-- __NB__: This case only exists to guard against the following one +tweak h@(Token (Reserved _) _ _) t = h : t +tweak t1 (t2@(Token (Numeric num) _ _) : rem) + | notLayout t1 && touches t1 t2 && isSigned num = + t1 + : Token + (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num))))) + (start t2) + (inc $ start t2) + : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2) + : rem + where isSigned num = all (\ch -> ch == '-' || ch == '+') $ take 1 num +tweak h t = h : t formatTrivialError :: Set String -> Set String -> [Char] formatTrivialError unexpectedTokens expectedTokens = @@ -377,7 +386,7 @@ formatTrivialError unexpectedTokens expectedTokens = displayLexeme :: 
Lexeme -> String displayLexeme = \case Open o -> o - Semi True -> "end of section" + Semi True -> "end of stanza" Semi False -> "semicolon" Close -> "end of section" Reserved r -> "'" <> r <> "'" @@ -397,16 +406,6 @@ infixl 2 <+> (<+>) :: (Monoid a) => P a -> P a -> P a p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) -lexemes :: P [Token Lexeme] -lexemes = lexemes' eof - where - eof :: P [Token Lexeme] - eof = P.try do - p <- P.eof >> posP - n <- maybe 0 (const 1) <$> S.gets opening - l <- S.gets layout - pure $ replicate (length l + n) (Token Close p p) - -- Runs the parser `p`, then: -- 1. resets the layout stack to be what it was before `p`. -- 2. emits enough closing tokens to reach `lbl` but not pop it. @@ -998,7 +997,14 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] -lexemes' eof = +lexemes' = + -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one, + -- runs `postLex`, then removes it. + fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) . lexemes + +-- | Consumes an entire Unison “module”. +lexemes :: P [Token Lexeme] -> P [Token Lexeme] +lexemes eof = P.optional space >> do hd <- join <$> P.manyTill toks (P.lookAhead eof) tl <- eof @@ -1581,8 +1587,11 @@ reorder = foldr fixup [] . join . sortWith f . stanzas preParse :: [Token Lexeme] -> T (Token Lexeme) preParse = reorderTree reorder . tree -lexer :: String -> String -> [Token Lexeme] -lexer scope = toList . preParse . lexer0' scope +-- | A few transformations that happen between lexing and parsing. +-- +-- All of these things should move out of the lexer, and be applied in the parse. +postLex :: [Token Lexeme] -> [Token Lexeme] +postLex = toList . preParse . foldr tweak [] isDelayOrForce :: Char -> Bool isDelayOrForce op = op == '\'' || op == '!' diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 733ecc93cf..498e460f3f 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -17,6 +17,7 @@ module Unison.Syntax.Parser chainr1, character, closeBlock, + optionalCloseBlock, doc, failCommitted, failureIf, @@ -270,6 +271,11 @@ semi = label "newline or semicolon" $ queryToken go closeBlock :: (Ord v) => P v m (L.Token ()) closeBlock = void <$> matchToken L.Close +-- | With layout, blocks might “close” without an explicit outdent (e.g., not even a newline at the end of a +-- `DocTransclude`). This allows those blocks to be closed by EOF. 
+optionalCloseBlock :: (Ord v) => P v m (L.Token ()) +optionalCloseBlock = closeBlock <|> (\() -> L.Token () mempty mempty) <$> P.eof + wordyPatternName :: (Var v) => P v m (L.Token v) wordyPatternName = queryToken \case L.WordyId (HQ'.NameOnly n) -> Just $ Name.toVar n From 94065e06104b155a66fd8268e6498e0540ba5108 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 19:54:06 -0600 Subject: [PATCH 08/22] Make comments into Haddock --- unison-syntax/src/Unison/Syntax/Lexer.hs | 13 +++--- unison-syntax/src/Unison/Syntax/Parser.hs | 55 ++++++++++++----------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 2ca3dc3738..ce2d63b564 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -90,17 +90,16 @@ type BlockName = String type Layout = [(BlockName, Column)] data ParsingEnv = ParsingEnv - { -- layout stack + { -- | layout stack layout :: !Layout, - -- `Just b` if a block of type `b` is being opened + -- | `Just b` if a block of type `b` is being opened opening :: Maybe BlockName, - -- are we inside a construct that uses layout? + -- | are we inside a construct that uses layout? inLayout :: Bool, - -- Use a stack to remember the parent section and - -- allow docSections within docSections. - -- 1 means we are inside a # Heading 1 + -- | Use a stack to remember the parent section and allow docSections within docSections. + -- - 1 means we are inside a # Heading 1 parentSections :: [Int], - -- 4 means we are inside a list starting at the fourth column + -- | 4 means we are inside a list starting at the fourth column parentListColumn :: Int } deriving (Show) diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 498e460f3f..e12a2a94c4 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -158,19 +158,20 @@ data Error v | UnknownType (L.Token (HQ.HashQualified Name)) (Set Reference) | UnknownId (L.Token (HQ.HashQualified Name)) (Set Referent) (Set Reference) | ExpectedBlockOpen String (L.Token L.Lexeme) - | -- Indicates a cases or match/with which doesn't have any patterns + | -- | Indicates a cases or match/with which doesn't have any patterns EmptyMatch (L.Token ()) | EmptyWatch Ann | UseInvalidPrefixSuffix (Either (L.Token Name) (L.Token Name)) (Maybe [L.Token Name]) | UseEmpty (L.Token String) -- an empty `use` statement | DidntExpectExpression (L.Token L.Lexeme) (Maybe (L.Token L.Lexeme)) | TypeDeclarationErrors [UF.Error v Ann] - | -- MissingTypeModifier (type|ability) name + | -- | MissingTypeModifier (type|ability) name MissingTypeModifier (L.Token String) (L.Token v) | ResolutionFailures [Names.ResolutionFailure v Ann] | DuplicateTypeNames [(v, [Ann])] | DuplicateTermNames [(v, [Ann])] - | PatternArityMismatch Int Int Ann -- PatternArityMismatch expectedArity actualArity location + | -- | PatternArityMismatch expectedArity actualArity location + PatternArityMismatch Int Int Ann | FloatPattern Ann deriving (Show, Eq, Ord) @@ -242,11 +243,11 @@ run' p s name env = run :: (Monad m, Ord v) => P v m a -> String -> ParsingEnv m -> m (Either (Err v) a) run p s = run' p s "" --- Virtual pattern match on a lexeme. +-- | Virtual pattern match on a lexeme. 
queryToken :: (Ord v) => (L.Lexeme -> Maybe a) -> P v m (L.Token a) queryToken f = P.token (traverse f) Set.empty --- Consume a block opening and return the string that opens the block. +-- | Consume a block opening and return the string that opens the block. openBlock :: (Ord v) => P v m (L.Token String) openBlock = queryToken getOpen where @@ -256,23 +257,23 @@ openBlock = queryToken getOpen openBlockWith :: (Ord v) => String -> P v m (L.Token ()) openBlockWith s = void <$> P.satisfy ((L.Open s ==) . L.payload) --- Match a particular lexeme exactly, and consume it. +-- | Match a particular lexeme exactly, and consume it. matchToken :: (Ord v) => L.Lexeme -> P v m (L.Token L.Lexeme) matchToken x = P.satisfy ((==) x . L.payload) --- Consume a virtual semicolon +-- | Consume a virtual semicolon semi :: (Ord v) => P v m (L.Token ()) semi = label "newline or semicolon" $ queryToken go where go (L.Semi _) = Just () go _ = Nothing --- Consume the end of a block +-- | Consume the end of a block closeBlock :: (Ord v) => P v m (L.Token ()) closeBlock = void <$> matchToken L.Close -- | With layout, blocks might “close” without an explicit outdent (e.g., not even a newline at the end of a --- `DocTransclude`). This allows those blocks to be closed by EOF. +-- `DocTransclude`). This allows those blocks to be closed by EOF. optionalCloseBlock :: (Ord v) => P v m (L.Token ()) optionalCloseBlock = closeBlock <|> (\() -> L.Token () mempty mempty) <$> P.eof @@ -281,13 +282,13 @@ wordyPatternName = queryToken \case L.WordyId (HQ'.NameOnly n) -> Just $ Name.toVar n _ -> Nothing --- Parse an prefix identifier e.g. Foo or (+), discarding any hash +-- | Parse a prefix identifier e.g. Foo or (+), discarding any hash prefixDefinitionName :: (Var v) => P v m (L.Token v) prefixDefinitionName = wordyDefinitionName <|> parenthesize symbolyDefinitionName --- Parse a prefix identifier e.g. Foo or (+), rejecting any hash --- This is useful for term declarations, where type signatures and term names should not have hashes. +-- | Parse a prefix identifier e.g. Foo or (+), rejecting any hash +-- This is useful for term declarations, where type signatures and term names should not have hashes. prefixTermName :: (Var v) => P v m (L.Token v) prefixTermName = wordyTermName <|> parenthesize symbolyTermName where @@ -299,34 +300,34 @@ prefixTermName = wordyTermName <|> parenthesize symbolyTermName L.SymbolyId (HQ'.NameOnly n) -> Just $ Name.toVar n _ -> Nothing --- Parse a wordy identifier e.g. Foo, discarding any hash +-- | Parse a wordy identifier e.g. 
Foo, discarding any hash wordyDefinitionName :: (Var v) => P v m (L.Token v) wordyDefinitionName = queryToken $ \case L.WordyId n -> Just $ Name.toVar (HQ'.toName n) L.Blank s -> Just $ Var.nameds ("_" <> s) _ -> Nothing --- Parse a wordyId as a Name, rejecting any hash +-- | Parse a wordyId as a Name, rejecting any hash importWordyId :: (Ord v) => P v m (L.Token Name) importWordyId = queryToken \case L.WordyId (HQ'.NameOnly n) -> Just n L.Blank s | not (null s) -> Just $ Name.unsafeParseText (Text.pack ("_" <> s)) _ -> Nothing --- The `+` in: use Foo.bar + as a Name +-- | The `+` in: use Foo.bar + as a Name importSymbolyId :: (Ord v) => P v m (L.Token Name) importSymbolyId = queryToken \case L.SymbolyId (HQ'.NameOnly n) -> Just n _ -> Nothing --- Parse a symboly ID like >>= or &&, discarding any hash +-- | Parse a symboly ID like >>= or &&, discarding any hash symbolyDefinitionName :: (Var v) => P v m (L.Token v) symbolyDefinitionName = queryToken $ \case L.SymbolyId n -> Just $ Name.toVar (HQ'.toName n) _ -> Nothing -- | Expect parentheses around a token, includes the parentheses within the start/end --- annotations of the resulting token. +-- annotations of the resulting token. parenthesize :: (Ord v) => P v m (L.Token a) -> P v m (L.Token a) parenthesize p = do (start, a) <- P.try do @@ -340,7 +341,7 @@ hqPrefixId, hqInfixId :: (Ord v) => P v m (L.Token (HQ.HashQualified Name)) hqPrefixId = hqWordyId_ <|> parenthesize hqSymbolyId_ hqInfixId = hqSymbolyId_ --- Parse a hash-qualified alphanumeric identifier +-- | Parse a hash-qualified alphanumeric identifier hqWordyId_ :: (Ord v) => P v m (L.Token (HQ.HashQualified Name)) hqWordyId_ = queryToken \case L.WordyId n -> Just $ HQ'.toHQ n L.Hash h -> Just $ HQ.HashOnly h L.Blank s | not (null s) -> Just $ HQ.NameOnly (Name.unsafeParseText (Text.pack ("_" <> s))) _ -> Nothing --- Parse a hash-qualified symboly ID like >>=#foo or && +-- | Parse a hash-qualified symboly ID like >>=#foo or && hqSymbolyId_ :: (Ord v) => P v m (L.Token (HQ.HashQualified Name)) hqSymbolyId_ = queryToken \case L.SymbolyId n -> Just (HQ'.toHQ n) _ -> Nothing --- Parse a reserved word +-- | Parse a reserved word reserved :: (Ord v) => String -> P v m (L.Token String) reserved w = label w $ queryToken getReserved where getReserved (L.Reserved w') | w == w' = Just w getReserved _ = Nothing --- Parse a placeholder or typed hole +-- | Parse a placeholder or typed hole blank :: (Ord v) => P v m (L.Token String) blank = label "blank" $ queryToken getBlank where @@ -436,12 +437,12 @@ chainr1 p op = go1 go1 = p >>= go2 go2 hd = do { op <- op; op hd <$> go1 } <|> pure hd --- Parse `p` 1+ times, combining with `op` +-- | Parse `p` 1+ times, combining with `op` chainl1 :: (Ord v) => P v m a -> P v m (a -> a -> a) -> P v m a chainl1 p op = foldl (flip ($)) <$> p <*> P.many (flip <$> op <*> p) --- If `p` would succeed, this fails uncommitted. --- Otherwise, `failIfOk` used to produce the output +-- | If `p` would succeed, this fails uncommitted. +-- Otherwise, `failIfOk` is used to produce the output failureIf :: (Ord v) => P v m (P v m b) -> P v m a -> P v m b failureIf failIfOk p = do dontwant <- P.try . 
P.lookAhead $ failIfOk @@ -449,9 +450,9 @@ failureIf failIfOk p = do when (isJust p) $ fail "failureIf" dontwant --- Gives this var an id based on its position - a useful trick to --- obtain a variable whose id won't match any other id in the file --- `positionalVar a Var.missingResult` +-- | Gives this var an id based on its position - a useful trick to +-- obtain a variable whose id won't match any other id in the file +-- `positionalVar a Var.missingResult` positionalVar :: (Annotated a, Var v) => a -> v -> v positionalVar a v = let s = start (ann a) From 567238fae8dccfe399e935c919e93a363a30a03a Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 22 Jul 2024 16:44:17 -0600 Subject: [PATCH 09/22] Expose `preParse` to the parser --- parser-typechecker/src/Unison/PrintError.hs | 2 +- unison-syntax/src/Unison/Syntax/Lexer.hs | 28 ++++++++------------- unison-syntax/src/Unison/Syntax/Parser.hs | 11 +++----- unison-syntax/test/Main.hs | 4 +-- 4 files changed, 18 insertions(+), 27 deletions(-) diff --git a/parser-typechecker/src/Unison/PrintError.hs b/parser-typechecker/src/Unison/PrintError.hs index 5647ccde63..8b73b179f1 100644 --- a/parser-typechecker/src/Unison/PrintError.hs +++ b/parser-typechecker/src/Unison/PrintError.hs @@ -1336,7 +1336,7 @@ prettyParseError s e = lexerOutput :: Pretty (AnnotatedText a) lexerOutput = if showLexerOutput - then "\nLexer output:\n" <> fromString (L.debugLex' s) + then "\nLexer output:\n" <> fromString (L.debugPreParse' s) else mempty renderParseErrors :: diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index ce2d63b564..dc755e7c79 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -19,11 +19,11 @@ module Unison.Syntax.Lexer DocJoin (..), DocEmbedAnnotation (..), lexer, + preParse, escapeChars, - debugFileLex, - debugLex', - debugLex'', - debugLex''', + debugFilePreParse, + debugPreParse, + debugPreParse', showEscapeChar, touches, @@ -1628,14 +1628,11 @@ typeModifiersAlt f = inc :: Pos -> Pos inc (Pos line col) = Pos line (col + 1) -debugFileLex :: String -> IO () -debugFileLex file = do - contents <- readUtf8 file - let s = debugLex'' (lexer file (Text.unpack contents)) - putStrLn s +debugFilePreParse :: FilePath -> IO () +debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file -debugLex'' :: [Token Lexeme] -> String -debugLex'' [Token (Err (UnexpectedTokens msg)) start end] = +debugPreParse :: T (Token Lexeme) -> String +debugPreParse (L (Token (Err (UnexpectedTokens msg)) start end)) = (if start == end then msg1 else msg2) <> ":\n" <> msg where msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start) @@ -1648,13 +1645,10 @@ debugLex'' [Token (Err (UnexpectedTokens msg)) start end] = <> show (line end) <> ", column " <> show (column end) -debugLex'' ts = show . fmap payload . tree $ ts +debugPreParse ts = show $ payload <$> ts -debugLex' :: String -> String -debugLex' = debugLex'' . lexer "debugLex" - -debugLex''' :: String -> String -> String -debugLex''' s = debugLex'' . lexer s +debugPreParse' :: String -> String +debugPreParse' = debugPreParse . preParse . 
lexer "debugPreParse" instance EP.ShowErrorComponent (Token Err) where showErrorComponent (Token err _ _) = go err diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index e12a2a94c4..344de0fd1b 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -61,6 +61,7 @@ where import Control.Monad.Reader (ReaderT (..)) import Control.Monad.Reader.Class (asks) import Crypto.Random qualified as Random +import Data.Bool (bool) import Data.Bytes.Put (runPutS) import Data.Bytes.Serial (serialize) import Data.Bytes.VarInt (VarInt (..)) @@ -199,8 +200,7 @@ label = P.label traceRemainingTokens :: (Ord v) => String -> P v m () traceRemainingTokens label = do remainingTokens <- lookAhead $ many anyToken - let _ = - trace ("REMAINDER " ++ label ++ ":\n" ++ L.debugLex'' remainingTokens) () + let _ = trace ("REMAINDER " ++ label ++ ":\n" ++ L.debugPreParse (L.preParse remainingTokens)) () pure () mkAnn :: (Annotated a, Annotated b) => a -> b -> Ann @@ -231,12 +231,9 @@ rootFile p = p <* P.eof run' :: (Monad m, Ord v) => P v m a -> String -> String -> ParsingEnv m -> m (Either (Err v) a) run' p s name env = - let lex = - if debug - then L.lexer name (trace (L.debugLex''' "lexer receives" s) s) - else L.lexer name s + let lex = bool id (traceWith L.debugPreParse) debug . L.preParse $ L.lexer name s pTraced = traceRemainingTokens "parser receives" *> p - in runReaderT (runParserT pTraced name (Input lex)) env <&> \case + in runReaderT (runParserT pTraced name . Input $ toList lex) env <&> \case Left err -> Left (Nel.head (P.bundleErrors err)) Right x -> Right x diff --git a/unison-syntax/test/Main.hs b/unison-syntax/test/Main.hs index bd40c7ded8..5c13940b0a 100644 --- a/unison-syntax/test/Main.hs +++ b/unison-syntax/test/Main.hs @@ -221,8 +221,8 @@ test = t :: String -> [Lexeme] -> Test () t s expected = - let actual0 = payload <$> lexer "ignored filename" s - actual = take (length actual0 - 2) . drop 1 $ actual0 + let actual0 = payload <$> preParse (lexer "ignored filename" s) + actual = take (length actual0 - 2) . 
drop 1 $ toList actual0 in scope s $ if actual == expected then ok From 6c561f314628c1e8b6e77fa6c7e0118f67265a71 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 22 Jul 2024 16:45:21 -0600 Subject: [PATCH 10/22] Rename `T` to `BlockTree` --- unison-syntax/src/Unison/Syntax/Lexer.hs | 49 +++++++++++++----------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index dc755e7c79..a83d4da38b 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1520,15 +1520,18 @@ pop = drop 1 topLeftCorner :: Pos topLeftCorner = Pos 1 1 -data T a = T a [T a] [a] | L a deriving (Functor, Foldable, Traversable) - -headToken :: T a -> a -headToken (T a _ _) = a -headToken (L a) = a - -instance (Show a) => Show (T a) where - show (L a) = show a - show (T open mid close) = +data BlockTree a + = Block a [BlockTree a] [a] + | Leaf a + deriving (Functor, Foldable, Traversable) + +headToken :: BlockTree a -> a +headToken (Block a _ _) = a +headToken (Leaf a) = a + +instance (Show a) => Show (BlockTree a) where + show (Leaf a) = show a + show (Block open mid close) = show open ++ "\n" ++ indent " " (intercalateMap "\n" show mid) @@ -1539,26 +1542,26 @@ instance (Show a) => Show (T a) where go by '\n' = '\n' : by go _ c = [c] -reorderTree :: ([T a] -> [T a]) -> T a -> T a -reorderTree _ l@(L _) = l -reorderTree f (T open mid close) = T open (f (reorderTree f <$> mid)) close +reorderTree :: ([BlockTree a] -> [BlockTree a]) -> BlockTree a -> BlockTree a +reorderTree f (Block open mid close) = Block open (f (reorderTree f <$> mid)) close +reorderTree _ l = l -tree :: [Token Lexeme] -> T (Token Lexeme) +tree :: [Token Lexeme] -> BlockTree (Token Lexeme) tree toks = one toks const where - one (open@(payload -> Open _) : ts) k = many (T open) [] ts k - one (t : ts) k = k (L t) ts + one (open@(payload -> Open _) : ts) k = many (Block open) [] ts k + one (t : ts) k = k (Leaf t) ts one [] k = k lastErr [] where - lastErr = case drop (length toks - 1) toks of - [] -> L (Token (Err LayoutError) topLeftCorner topLeftCorner) - (t : _) -> L $ t {payload = Err LayoutError} + lastErr = Leaf case drop (length toks - 1) toks of + [] -> Token (Err LayoutError) topLeftCorner topLeftCorner + (t : _) -> t {payload = Err LayoutError} many open acc [] k = k (open (reverse acc) []) [] many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) [t]) ts many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k -stanzas :: [T (Token Lexeme)] -> [[T (Token Lexeme)]] +stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] stanzas = go [] where go acc [] = [reverse acc] @@ -1568,7 +1571,7 @@ stanzas = go [] -- Moves type and ability declarations to the front of the token stream -- and move `use` statements to the front of each block -reorder :: [T (Token Lexeme)] -> [T (Token Lexeme)] +reorder :: [BlockTree (Token Lexeme)] -> [BlockTree (Token Lexeme)] reorder = foldr fixup [] . join . sortWith f . stanzas where f [] = 3 :: Int @@ -1583,7 +1586,7 @@ reorder = foldr fixup [] . join . sortWith f . stanzas fixup tok tail = tok : tail -- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. -preParse :: [Token Lexeme] -> T (Token Lexeme) +preParse :: [Token Lexeme] -> BlockTree (Token Lexeme) preParse = reorderTree reorder . tree -- | A few transformations that happen between lexing and parsing. 
@@ -1631,8 +1634,8 @@ inc (Pos line col) = Pos line (col + 1) debugFilePreParse :: FilePath -> IO () debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file -debugPreParse :: T (Token Lexeme) -> String -debugPreParse (L (Token (Err (UnexpectedTokens msg)) start end)) = +debugPreParse :: BlockTree (Token Lexeme) -> String +debugPreParse (Leaf (Token (Err (UnexpectedTokens msg)) start end)) = (if start == end then msg1 else msg2) <> ":\n" <> msg where msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start) From 3158e666033b124c17ec20ff7f96549548533703 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Tue, 23 Jul 2024 13:59:12 -0600 Subject: [PATCH 11/22] Restructure `BlockTree` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now build the stanzas at the same time as the tree, and don’t discard them after reordering. This also changes the closing element of `Block` to be `Maybe` instead of `[]`. --- unison-syntax/src/Unison/Syntax/Lexer.hs | 50 +++++++++++++++--------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index a83d4da38b..a356757bf7 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -39,6 +39,7 @@ module Unison.Syntax.Lexer where import Control.Comonad.Cofree (Cofree ((:<))) +import Control.Lens qualified as Lens import Control.Monad.State qualified as S import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) import Data.Foldable qualified as Foldable @@ -1521,7 +1522,13 @@ topLeftCorner :: Pos topLeftCorner = Pos 1 1 data BlockTree a - = Block a [BlockTree a] [a] + = Block + -- | The token that opens the block + a + -- | “Stanzas” of nested tokens + [[BlockTree a]] + -- | The closing token, if any + (Maybe a) | Leaf a deriving (Functor, Foldable, Traversable) @@ -1534,22 +1541,22 @@ instance (Show a) => Show (BlockTree a) where show (Block open mid close) = show open ++ "\n" - ++ indent " " (intercalateMap "\n" show mid) + ++ indent " " (intercalateMap "\n" (intercalateMap " " show) mid) ++ "\n" - ++ intercalateMap "" show close + ++ maybe "" show close where indent by s = by ++ (s >>= go by) go by '\n' = '\n' : by go _ c = [c] -reorderTree :: ([BlockTree a] -> [BlockTree a]) -> BlockTree a -> BlockTree a -reorderTree f (Block open mid close) = Block open (f (reorderTree f <$> mid)) close +reorderTree :: ([[BlockTree a]] -> [[BlockTree a]]) -> BlockTree a -> BlockTree a +reorderTree f (Block open mid close) = Block open (f (fmap (reorderTree f) <$> mid)) close reorderTree _ l = l tree :: [Token Lexeme] -> BlockTree (Token Lexeme) tree toks = one toks const where - one (open@(payload -> Open _) : ts) k = many (Block open) [] ts k + one (open@(payload -> Open _) : ts) k = many (Block open . 
stanzas) [] ts k one (t : ts) k = k (Leaf t) ts one [] k = k lastErr [] where @@ -1557,22 +1564,24 @@ tree toks = one toks const [] -> Token (Err LayoutError) topLeftCorner topLeftCorner (t : _) -> t {payload = Err LayoutError} - many open acc [] k = k (open (reverse acc) []) [] - many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) [t]) ts + many open acc [] k = k (open (reverse acc) Nothing) [] + many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) $ pure t) ts many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] -stanzas = go [] - where - go acc [] = [reverse acc] - go acc (t : ts) = case payload $ headToken t of - Semi _ -> reverse (t : acc) : go [] ts - _ -> go (t : acc) ts +stanzas = + toList + . foldr + ( \tok (curr :| stanzas) -> case tok of + Leaf (Token (Semi _) _ _) -> [tok] :| curr : stanzas + _ -> (tok : curr) :| stanzas + ) + ([] :| []) -- Moves type and ability declarations to the front of the token stream -- and move `use` statements to the front of each block -reorder :: [BlockTree (Token Lexeme)] -> [BlockTree (Token Lexeme)] -reorder = foldr fixup [] . join . sortWith f . stanzas +reorder :: [[BlockTree (Token Lexeme)]] -> [[BlockTree (Token Lexeme)]] +reorder = foldr fixup [] . sortWith f where f [] = 3 :: Int f (t0 : _) = case payload $ headToken t0 of @@ -1582,8 +1591,13 @@ reorder = foldr fixup [] . join . sortWith f . stanzas _ -> 3 :: Int -- after reordering can end up with trailing semicolon at the end of -- a block, which we remove with this pass - fixup (payload . headToken -> Semi _) [] = [] - fixup tok tail = tok : tail + fixup stanza [] = case Lens.unsnoc stanza of + Nothing -> [] + -- remove any trailing `Semi` from the last non-empty stanza + Just (init, Leaf (Token (Semi _) _ _)) -> [init] + -- don’t touch other stanzas + Just (_, _) -> [stanza] + fixup stanza tail = stanza : tail -- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. 
preParse :: [Token Lexeme] -> BlockTree (Token Lexeme) From a6f6d9c8dc35adabb6052eccb5e341b80083d317 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Wed, 24 Jul 2024 22:59:18 -0600 Subject: [PATCH 12/22] Remove unnecessary `docOpen` in Doc parser --- unison-syntax/src/Unison/Syntax/Lexer.hs | 2 -- 1 file changed, 2 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index a356757bf7..9f2119011a 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -653,7 +653,6 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) let end = P.lookAhead $ void docClose - <|> void docOpen <|> void (P.satisfy isSpace) <|> void closing word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end @@ -768,7 +767,6 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) pure ex docClose = [] <$ docClose' - docOpen = [] <$ lit "{{" link = P.label "link (examples: {type List}, {Nat.+})" $ From c53cb088e1262a2f06a81fc1dc66f685d86cc707 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 10:28:33 -0600 Subject: [PATCH 13/22] Split `Doc` into its own module --- .../src/Unison/Syntax/TermParser.hs | 79 +++--- unison-syntax/src/Unison/Syntax/Lexer.hs | 232 ++++-------------- .../src/Unison/Syntax/Lexer/Token.hs | 4 + unison-syntax/src/Unison/Syntax/Parser.hs | 5 +- .../src/Unison/Syntax/Parser/Doc/Data.hs | 166 +++++++++++++ unison-syntax/unison-syntax.cabal | 1 + 6 files changed, 260 insertions(+), 227 deletions(-) create mode 100644 unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 999d5658ba..89d5504079 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -25,7 +25,7 @@ import Data.Sequence qualified as Sequence import Data.Set qualified as Set import Data.Text qualified as Text import Data.Tuple.Extra qualified as TupleE -import Data.Void (vacuous) +import Data.Void (absurd, vacuous) import Text.Megaparsec qualified as P import U.Core.ABT qualified as ABT import Unison.ABT qualified as ABT @@ -52,6 +52,7 @@ import Unison.Syntax.Name qualified as Name (toText, toVar, unsafeParseVar) import Unison.Syntax.NameSegment qualified as NameSegment import Unison.Syntax.Parser hiding (seq) import Unison.Syntax.Parser qualified as Parser (seq, uniqueName) +import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Syntax.TypeParser qualified as TypeParser import Unison.Term (IsTop, Term) import Unison.Term qualified as Term @@ -525,43 +526,43 @@ doc2Block = do f :: (Annotated a) => a -> String -> Term v Ann f a = Term.var (gann a) . Var.nameds . 
("syntax.doc" <>) - docUntitledSection :: Ann -> L.DocUntitledSection (Term v Ann) -> Term v Ann - docUntitledSection ann (L.DocUntitledSection tops) = + docUntitledSection :: Ann -> Doc.UntitledSection (Term v Ann) -> Term v Ann + docUntitledSection ann (Doc.UntitledSection tops) = Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops - docTop :: L.DocTop (Term v Ann) -> TermP v m + docTop :: Doc.Top [L.Token L.Lexeme] (Term v Ann) -> TermP v m docTop d = case d of - L.DocSection title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] - L.DocEval code -> + Doc.Section title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] + Doc.Eval code -> Term.app (gann d) (f d "Eval") . addDelay . snd <$> subParse (block' False False "syntax.docEval" (pure $ pure ()) $ Ann.External <$ P.eof) code - L.DocExampleBlock code -> + Doc.ExampleBlock code -> Term.apps' (f d "ExampleBlock") . (Term.nat (gann d) 0 :) . pure . addDelay . snd <$> subParse (block' False True "syntax.docExampleBlock" (pure $ pure ()) $ Ann.External <$ P.eof) code - L.DocCodeBlock label body -> + Doc.CodeBlock label body -> pure $ Term.apps' (f d "CodeBlock") [Term.text (ann label) . Text.pack $ L.payload label, Term.text (ann body) . Text.pack $ L.payload body] - L.DocBulletedList items -> + Doc.BulletedList items -> pure $ Term.app (gann d) (f d "BulletedList") . Term.list (gann items) . toList $ docColumn <$> items - L.DocNumberedList items@((n, _) :| _) -> + Doc.NumberedList items@((n, _) :| _) -> pure $ Term.apps' (f d "NumberedList") [Term.nat (ann d) $ L.payload n, Term.list (gann $ snd <$> items) . toList $ docColumn . snd <$> items] - L.DocParagraph leaves -> + Doc.Paragraph leaves -> Term.app (gann d) (f d "Paragraph") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docColumn :: L.DocColumn (Term v Ann) -> Term v Ann - docColumn d@(L.DocColumn para sublist) = + docColumn :: Doc.Column (Term v Ann) -> Term v Ann + docColumn d@(Doc.Column para sublist) = Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist - docLeaf :: L.DocLeaf (Term v Ann) -> TermP v m + docLeaf :: Doc.Leaf [L.Token L.Lexeme] (Term v Ann) -> TermP v m docLeaf d = case d of - L.DocLink link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link - L.DocNamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) - L.DocExample code -> do + Doc.Link link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link + Doc.NamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) + Doc.Example code -> do trm <- subParse term code pure . 
Term.apps' (f d "Example") $ case trm of tm@(Term.Apps' _ xs) -> @@ -570,45 +571,45 @@ doc2Block = do lam = addDelay $ Term.lam' (ann tm) ((mempty,) <$> fvs) tm in [n, lam] tm -> [Term.nat (ann tm) 0, addDelay tm] - L.DocTransclude code -> Term.app (gann d) (f d "Transclude") <$> subParse term code - L.DocBold para -> pure $ Term.app (gann d) (f d "Bold") para - L.DocItalic para -> pure $ Term.app (gann d) (f d "Italic") para - L.DocStrikethrough para -> pure $ Term.app (gann d) (f d "Strikethrough") para - L.DocVerbatim leaf -> Term.app (gann d) (f d "Verbatim") <$> docLeaf (vacuous leaf) - L.DocCode leaf -> Term.app (gann d) (f d "Code") <$> docLeaf (vacuous leaf) - L.DocSource elems -> + Doc.Transclude code -> Term.app (gann d) (f d "Transclude") <$> subParse term code + Doc.Bold para -> pure $ Term.app (gann d) (f d "Bold") para + Doc.Italic para -> pure $ Term.app (gann d) (f d "Italic") para + Doc.Strikethrough para -> pure $ Term.app (gann d) (f d "Strikethrough") para + Doc.Verbatim leaf -> Term.app (gann d) (f d "Verbatim") <$> docLeaf (bimap absurd absurd leaf) + Doc.Code leaf -> Term.app (gann d) (f d "Code") <$> docLeaf (bimap absurd absurd leaf) + Doc.Source elems -> Term.app (gann d) (f d "Source") . Term.list (ann elems) . toList <$> traverse docSourceElement elems - L.DocFoldedSource elems -> + Doc.FoldedSource elems -> Term.app (gann d) (f d "FoldedSource") . Term.list (ann elems) . toList <$> traverse docSourceElement elems - L.DocEvalInline code -> Term.app (gann d) (f d "EvalInline") . addDelay <$> subParse term code - L.DocSignature links -> + Doc.EvalInline code -> Term.app (gann d) (f d "EvalInline") . addDelay <$> subParse term code + Doc.Signature links -> Term.app (gann d) (f d "Signature") . Term.list (ann links) . toList <$> traverse docEmbedSignatureLink links - L.DocSignatureInline link -> Term.app (gann d) (f d "SignatureInline") <$> docEmbedSignatureLink link - L.DocWord txt -> pure . Term.app (gann d) (f d "Word") . Term.text (ann txt) . Text.pack $ L.payload txt - L.DocGroup (L.DocJoin leaves) -> + Doc.SignatureInline link -> Term.app (gann d) (f d "SignatureInline") <$> docEmbedSignatureLink link + Doc.Word txt -> pure . Term.app (gann d) (f d "Word") . Term.text (ann txt) . Text.pack $ L.payload txt + Doc.Group (Doc.Join leaves) -> Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docEmbedLink :: L.DocEmbedLink -> TermP v m + docEmbedLink :: Doc.EmbedLink -> TermP v m docEmbedLink d = case d of - L.DocEmbedTypeLink ident -> + Doc.EmbedTypeLink ident -> Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload <$> findUniqueType (HQ'.toHQ <$> ident) - L.DocEmbedTermLink ident -> + Doc.EmbedTermLink ident -> Term.app (gann d) (f d "EmbedTermLink") . 
addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docSourceElement :: L.DocSourceElement -> TermP v m - docSourceElement d@(L.DocSourceElement link anns) = do + docSourceElement :: Doc.SourceElement [L.Token L.Lexeme] -> TermP v m + docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] - docEmbedSignatureLink :: L.DocEmbedSignatureLink -> TermP v m - docEmbedSignatureLink d@(L.DocEmbedSignatureLink ident) = + docEmbedSignatureLink :: Doc.EmbedSignatureLink -> TermP v m + docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docEmbedAnnotation :: L.DocEmbedAnnotation -> TermP v m - docEmbedAnnotation d@(L.DocEmbedAnnotation a) = + docEmbedAnnotation :: Doc.EmbedAnnotation [L.Token L.Lexeme] -> TermP v m + docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes -- produced for an identifier and the lexemes consumed for text line up. So, I think this is a bugfix I can’t diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index 9f2119011a..c0d1c3c04c 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -9,15 +9,6 @@ module Unison.Syntax.Lexer Pos (..), Lexeme (..), DocTree, - DocUntitledSection (..), - DocTop (..), - DocColumn (..), - DocLeaf (..), - DocEmbedLink (..), - DocSourceElement (..), - DocEmbedSignatureLink (..), - DocJoin (..), - DocEmbedAnnotation (..), lexer, preParse, escapeChars, @@ -43,7 +34,6 @@ import Control.Lens qualified as Lens import Control.Monad.State qualified as S import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) import Data.Foldable qualified as Foldable -import Data.Functor.Classes import Data.List qualified as List import Data.List.Extra qualified as List import Data.List.NonEmpty (NonEmpty ((:|))) @@ -66,7 +56,7 @@ import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment -import Unison.Parser.Ann (Ann (Ann), Annotated (..)) +import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as SH @@ -75,6 +65,7 @@ import Unison.Syntax.Lexer.Token (Token (..), posP, tokenP) import Unison.Syntax.Name qualified as Name (isSymboly, nameP, toText, unsafeParseText) import Unison.Syntax.NameSegment (symbolyIdChar, wordyIdChar, wordyIdStartChar) import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) +import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) import Unison.Util.Bytes qualified as Bytes @@ -83,9 +74,6 @@ import Unison.Util.Monoid (intercalateMap) instance (Annotated a) => Annotated (Cofree f a) where ann (a :< _) = ann a -instance Annotated (Token a) where - ann (Token _ s e) = Ann s e - type BlockName = String type Layout = [(BlockName, Column)] @@ -158,7 +146,7 @@ data Lexeme | Bytes Bytes.Bytes 
-- bytes literals | Hash ShortHash -- hash literals | Err Err - | Doc (DocUntitledSection DocTree) + | Doc (Doc.UntitledSection DocTree) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? @@ -422,6 +410,8 @@ restoreStack lbl p = do S.put (s2 {layout = layout1}) pure $ p <> closes +type DocTree = Cofree (Doc.Top [Token Lexeme]) Ann + -- | The `Doc` lexer as documented on unison-lang.org doc2 :: P [Token Lexeme] doc2 = do @@ -508,148 +498,18 @@ someTill' p end = liftA2 (:|) p $ P.manyTill p end sepBy1' :: P a -> P sep -> P (NonEmpty a) sepBy1' p sep = liftA2 (:|) p . many $ sep *> p -newtype DocUntitledSection a = DocUntitledSection [a] - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - --- | Haskell parallel to @unison/base.Doc@. --- --- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The --- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls, --- have it rendered to a scratch file, and then we can’t parse it. Changing the types here to match Unison wouldn’t --- fix the issue. We have to modify the types and parser in concert (in both Haskell and Unison) to bring them in --- line. --- --- __NB__: Uses of @[`Token` `Lexeme`]@ here indicate a nested transition to the Unison lexer. -data DocTop a - = -- | The first argument is always a Paragraph - DocSection a [a] - | DocEval [Token Lexeme] - | DocExampleBlock [Token Lexeme] - | DocCodeBlock (Token String) (Token String) - | DocBulletedList (NonEmpty (DocColumn a)) - | DocNumberedList (NonEmpty (Token Word64, DocColumn a)) - | DocParagraph (NonEmpty (DocLeaf a)) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -instance Eq1 DocTop where - liftEq _ _ _ = True - -instance Ord1 DocTop where - liftCompare _ _ _ = LT - -instance Show1 DocTop where - liftShowsPrec _ _ _ _ x = x - -data DocColumn a - = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List - DocColumn a (Maybe a) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -data DocLeaf a - = DocLink DocEmbedLink - | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of - -- Transcludes & Words) - DocNamedLink a (DocLeaf Void) - | DocExample [Token Lexeme] - | DocTransclude [Token Lexeme] - | -- | Always a Paragraph - DocBold a - | -- | Always a Paragraph - DocItalic a - | -- | Always a Paragraph - DocStrikethrough a - | -- | Always a Word - DocVerbatim (DocLeaf Void) - | -- | Always a Word - DocCode (DocLeaf Void) - | DocSource (NonEmpty DocSourceElement) - | DocFoldedSource (NonEmpty DocSourceElement) - | DocEvalInline [Token Lexeme] - | DocSignature (NonEmpty DocEmbedSignatureLink) - | DocSignatureInline DocEmbedSignatureLink - | DocWord (Token String) - | DocGroup (DocJoin a) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -data DocEmbedLink - = DocEmbedTypeLink (Token (HQ'.HashQualified Name)) - | DocEmbedTermLink (Token (HQ'.HashQualified Name)) - deriving (Eq, Ord, Show) - -data DocSourceElement = DocSourceElement DocEmbedLink [DocEmbedAnnotation] - deriving (Eq, Ord, Show) - -newtype DocEmbedSignatureLink = DocEmbedSignatureLink (Token (HQ'.HashQualified Name)) - deriving (Eq, Ord, Show) - -newtype DocJoin a = DocJoin (NonEmpty (DocLeaf a)) - deriving (Eq, Ord, Show, Foldable, Functor, Traversable) - -newtype DocEmbedAnnotation - = -- | Always a DocTransclude - DocEmbedAnnotation (Either (Token 
(HQ'.HashQualified Name)) (DocLeaf Void)) - deriving (Eq, Ord, Show) - -type DocTree = Cofree DocTop Ann - -instance (Annotated a) => Annotated (DocTop a) where - ann = \case - DocSection title body -> ann title <> ann body - DocEval code -> ann code - DocExampleBlock code -> ann code - DocCodeBlock label body -> ann label <> ann body - DocBulletedList items -> ann items - DocNumberedList items -> ann $ snd <$> items - DocParagraph leaves -> ann leaves - -instance (Annotated a) => Annotated (DocColumn a) where - ann (DocColumn para list) = ann para <> ann list - -instance (Annotated a) => Annotated (DocLeaf a) where - ann = \case - DocLink link -> ann link - DocNamedLink label target -> ann label <> ann target - DocExample code -> ann code - DocTransclude code -> ann code - DocBold para -> ann para - DocItalic para -> ann para - DocStrikethrough para -> ann para - DocVerbatim word -> ann word - DocCode word -> ann word - DocSource elems -> ann elems - DocFoldedSource elems -> ann elems - DocEvalInline code -> ann code - DocSignature links -> ann links - DocSignatureInline link -> ann link - DocWord text -> ann text - DocGroup (DocJoin leaves) -> ann leaves - -instance Annotated DocEmbedLink where - ann = \case - DocEmbedTypeLink name -> ann name - DocEmbedTermLink name -> ann name - -instance Annotated DocSourceElement where - ann (DocSourceElement link target) = ann link <> ann target - -instance Annotated DocEmbedSignatureLink where - ann (DocEmbedSignatureLink name) = ann name - -instance Annotated DocEmbedAnnotation where - ann (DocEmbedAnnotation a) = either ann ann a - -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -docBody :: P end -> P (DocUntitledSection DocTree) -docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) +docBody :: P end -> P (Doc.UntitledSection DocTree) +docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) where wordyKw kw = separated wordySep (lit kw) sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap' . DocParagraph <$> spaced leaf + paragraph = wrap' . Doc.Paragraph <$> spaced leaf reserved word = List.isPrefixOf "}}" word || all (== '#') word - wordy :: P end -> P (DocLeaf void) - wordy closing = fmap DocWord . tokenP . P.try $ do + wordy :: P end -> P (Doc.Leaf [Token Lexeme] void) + wordy closing = fmap Doc.Word . tokenP . P.try $ do let end = P.lookAhead $ void docClose @@ -677,10 +537,10 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where comma = lit "," <* CP.space src = - src' DocSource "@source" - <|> src' DocFoldedSource "@foldedSource" + src' Doc.Source "@source" + <|> src' Doc.FoldedSource "@foldedSource" srcElem = - DocSourceElement + Doc.SourceElement <$> (typeLink <|> termLink) <*> ( fmap (fromMaybe []) . 
P.optional $ (lit "@") *> (CP.space *> annotations) @@ -688,38 +548,38 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space annotations = - P.some (DocEmbedAnnotation <$> annotation) + P.some (Doc.EmbedAnnotation <$> annotation) src' name atName = fmap name $ do _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space s <- sepBy1' srcElem comma _ <- lit "}" pure s - signature = fmap DocSignature $ do + signature = fmap Doc.Signature $ do _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space s <- sepBy1' signatureLink comma _ <- lit "}" pure s - signatureInline = fmap DocSignatureInline $ do + signatureInline = fmap Doc.SignatureInline $ do _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space s <- signatureLink _ <- lit "}" pure s - evalInline = fmap DocEvalInline $ do + evalInline = fmap Doc.EvalInline $ do _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space let inlineEvalClose = [] <$ lit "}" s <- lexemes' inlineEvalClose pure s - typeLink = fmap DocEmbedTypeLink $ do + typeLink = fmap Doc.EmbedTypeLink $ do _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space tokenP identifierP <* CP.space termLink = - fmap DocEmbedTermLink $ + fmap Doc.EmbedTermLink $ tokenP identifierP <* CP.space signatureLink = - fmap DocEmbedSignatureLink $ + fmap Doc.EmbedSignatureLink $ tokenP identifierP <* CP.space groupy closing p = do @@ -728,8 +588,8 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) pure $ case after of Nothing -> p Just after -> - DocGroup - . DocJoin + Doc.Group + . Doc.Join $ p :| pure after @@ -748,17 +608,17 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed -- If it's a multi-line verbatim block we trim any whitespace representing -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . DocVerbatim $ - DocWord $ + pure . Doc.Verbatim $ + Doc.Word $ Token txt start stop else - pure . DocCode $ - DocWord $ + pure . Doc.Code $ + Doc.Word $ Token originalText start stop exampleInline = P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - fmap DocExample $ do + fmap Doc.Example $ do n <- P.try $ do _ <- lit "`" length <$> P.takeWhile1P (Just "backticks") (== '`') @@ -770,12 +630,12 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) link = P.label "link (examples: {type List}, {Nat.+})" $ - fmap DocLink $ + fmap Doc.Link $ P.try $ lit "{" *> (typeLink <|> termLink) <* lit "}" expr = - fmap DocTransclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + fmap Doc.Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ openAs "{{" "syntax.docTransclude" *> do env0 <- S.get @@ -806,7 +666,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) P.label "block eval (syntax: a fenced code block)" $ evalUnison <|> exampleBlock <|> other where - evalUnison = fmap (wrap' . DocEval) $ do + evalUnison = fmap (wrap' . 
Doc.Eval) $ do -- commit after seeing that ``` is on its own line fence <- P.try $ do fence <- lit "```" <+> P.takeWhileP Nothing (== '`') @@ -817,7 +677,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) (\env -> env {inLayout = True, opening = Just "docEval"}) (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) - exampleBlock = fmap (wrap' . DocExampleBlock) $ do + exampleBlock = fmap (wrap' . Doc.ExampleBlock) $ do void $ lit "@typecheck" <* CP.space fence <- lit "```" <+> P.takeWhileP Nothing (== '`') local @@ -834,7 +694,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) skip _ s = s in List.intercalate "\n" $ skip column <$> lines s - other = fmap (uncurry $ wrapSimple2 DocCodeBlock) $ do + other = fmap (uncurry $ wrapSimple2 Doc.CodeBlock) $ do column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel let tabWidth = toInteger . P.unPos $ P.defaultTabWidth fence <- lit "```" <+> P.takeWhileP Nothing (== '`') @@ -857,26 +717,26 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) (P.satisfy (== '~')) name s = if take 1 s == "~" - then DocStrikethrough - else if take 1 s == "*" then DocBold else DocItalic + then Doc.Strikethrough + else if take 1 s == "*" then Doc.Bold else Doc.Italic end <- P.try $ do end <- start P.lookAhead (P.satisfy (not . isSpace)) pure end - name end . wrap' . DocParagraph + name end . wrap' . Doc.Paragraph <$> someTill' (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) (lit end) externalLink = P.label "hyperlink (example: [link name](https://destination.com))" $ - fmap (uncurry DocNamedLink) $ do + fmap (uncurry Doc.NamedLink) $ do _ <- lit "[" p <- leafies (void $ char ']') _ <- lit "]" _ <- lit "(" target <- - fmap (DocGroup . DocJoin) $ + fmap (Doc.Group . Doc.Join) $ fmap pure link <|> some' (expr <|> wordy (char ')')) _ <- lit ")" pure (p, target) @@ -894,12 +754,12 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) ok s = length [() | '\n' <- s] < 2 spaced p = some' (p <* P.optional sp) - leafies close = wrap' . DocParagraph <$> spaced (leafy close) + leafies close = wrap' . Doc.Paragraph <$> spaced (leafy close) list = bulletedList <|> numberedList - bulletedList = wrap' . DocBulletedList <$> sepBy1' bullet listSep - numberedList = wrap' . DocNumberedList <$> sepBy1' numberedItem listSep + bulletedList = wrap' . Doc.BulletedList <$> sepBy1' bullet listSep + numberedList = wrap' . Doc.NumberedList <$> sepBy1' numberedItem listSep listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) @@ -921,7 +781,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) numberedStart = listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - listItemParagraph = fmap (wrap' . DocParagraph) $ do + listItemParagraph = fmap (wrap' . Doc.Paragraph) $ do col <- column <$> posP some' (leaf <* sep col) where @@ -947,7 +807,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) numberedItem = P.label msg $ do (col, s) <- numberedStart (s,) - <$> ( fmap (uncurry DocColumn) $ do + <$> ( fmap (uncurry Doc.Column) $ do p <- nonNewlineSpaces *> listItemParagraph subList <- local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) @@ -956,7 +816,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) where msg = "numbered list (examples: 1. item1, 8. 
start numbering at '8')" - bullet = fmap (uncurry DocColumn) . P.label "bullet (examples: * item1, - item2)" $ do + bullet = fmap (uncurry Doc.Column) . P.label "bullet (examples: * item1, - item2)" $ do (col, _) <- bulletedStart p <- nonNewlineSpaces *> listItemParagraph subList <- @@ -978,7 +838,7 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) -- # A section title (not a subsection) section :: P DocTree - section = fmap (wrap' . uncurry DocSection) $ do + section = fmap (wrap' . uncurry Doc.Section) $ do ns <- S.gets parentSections hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp title <- paragraph <* CP.space @@ -988,10 +848,10 @@ docBody docClose' = DocUntitledSection <$> P.many (sectionElem <* CP.space) P.many (sectionElem <* CP.space) pure $ (title, body) - wrap' :: DocTop DocTree -> DocTree + wrap' :: Doc.Top [Token Lexeme] DocTree -> DocTree wrap' doc = ann doc :< doc - wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> DocTop DocTree) -> a -> b -> DocTree + wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Doc.Top [Token Lexeme] DocTree) -> a -> b -> DocTree wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs index 81842c409e..e29f276c5e 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs @@ -9,6 +9,7 @@ import Data.Text qualified as Text import Text.Megaparsec (ParsecT, TraversableStream) import Text.Megaparsec qualified as P import Unison.Lexer.Pos (Pos (Pos)) +import Unison.Parser.Ann (Ann (Ann), Annotated (..)) import Unison.Prelude data Token a = Token @@ -18,6 +19,9 @@ data Token a = Token } deriving stock (Eq, Ord, Show, Functor, Foldable, Traversable) +instance Annotated (Token a) where + ann (Token _ s e) = Ann s e + instance Applicative Token where pure a = Token a (Pos 0 0) (Pos 0 0) Token f start _ <*> Token a _ end = Token (f a) start end diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 344de0fd1b..1bee4d08f4 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -90,6 +90,7 @@ import Unison.Reference (Reference) import Unison.Referent (Referent) import Unison.Syntax.Lexer qualified as L import Unison.Syntax.Name qualified as Name (toVar, unsafeParseText) +import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Term (MatchCase (..)) import Unison.UnisonFile.Error qualified as UF import Unison.Util.Bytes (Bytes) @@ -270,7 +271,7 @@ closeBlock :: (Ord v) => P v m (L.Token ()) closeBlock = void <$> matchToken L.Close -- | With layout, blocks might “close” without an explicit outdent (e.g., not even a newline at the end of a --- `DocTransclude`). This allows those blocks to be closed by EOF. +-- `Doc.Transclude`). This allows those blocks to be closed by EOF. 
optionalCloseBlock :: (Ord v) => P v m (L.Token ()) optionalCloseBlock = closeBlock <|> (\() -> L.Token () mempty mempty) <$> P.eof @@ -399,7 +400,7 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing -doc :: (Ord v) => P v m (L.Token (L.DocUntitledSection L.DocTree)) +doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection L.DocTree)) doc = queryToken \case L.Doc d -> pure d _ -> Nothing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs new file mode 100644 index 0000000000..4a88200b8b --- /dev/null +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs @@ -0,0 +1,166 @@ +-- | Haskell parallel to @unison/base.Doc@. +-- +-- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The +-- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls, +-- have it rendered to a scratch file, and then we can’t parse it. Changing the types here to match Unison wouldn’t +-- fix the issue. We have to modify the types and parser in concert (in both Haskell and Unison) to bring them in +-- line. +module Unison.Syntax.Parser.Doc.Data where + +import Data.Functor.Classes +import Data.List.NonEmpty (NonEmpty) +import Unison.HashQualifiedPrime qualified as HQ' +import Unison.Name (Name) +import Unison.Parser.Ann (Annotated (..)) +import Unison.Prelude +import Unison.Syntax.Lexer.Token (Token (..)) + +newtype UntitledSection a = UntitledSection [a] + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data Top code a + = -- | The first argument is always a Paragraph + Section a [a] + | Eval code + | ExampleBlock code + | CodeBlock (Token String) (Token String) + | BulletedList (NonEmpty (Column a)) + | NumberedList (NonEmpty (Token Word64, Column a)) + | Paragraph (NonEmpty (Leaf code a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +instance Eq2 Top where + liftEq2 _ _ _ _ = True + +instance (Eq code) => Eq1 (Top code) + +instance Ord2 Top where + liftCompare2 _ _ _ _ = LT + +instance (Ord code) => Ord1 (Top code) + +instance Show2 Top where + liftShowsPrec2 _ _ _ _ _ _ x = x + +instance (Show code) => Show1 (Top code) + +data Column a + = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List + Column a (Maybe a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +data Leaf code a + = Link EmbedLink + | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of + -- Transcludes & Words) + NamedLink a (Leaf code Void) + | Example code + | Transclude code + | -- | Always a Paragraph + Bold a + | -- | Always a Paragraph + Italic a + | -- | Always a Paragraph + Strikethrough a + | -- | Always a Word + Verbatim (Leaf Void Void) + | -- | Always a Word + Code (Leaf Void Void) + | Source (NonEmpty (SourceElement code)) + | FoldedSource (NonEmpty (SourceElement code)) + | EvalInline code + | Signature (NonEmpty EmbedSignatureLink) + | SignatureInline EmbedSignatureLink + | Word (Token String) + | Group (Join code a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +instance Bifunctor Leaf where + bimap f g = \case + Link x -> Link x + NamedLink a leaf -> NamedLink (g a) $ first f leaf + Example code -> Example $ f code + Transclude code -> Transclude $ f code + Bold a -> Bold $ g a + Italic a -> Italic $ g a + Strikethrough a -> Strikethrough $ g a + Verbatim leaf -> Verbatim 
leaf + Code leaf -> Code leaf + Source elems -> Source $ fmap f <$> elems + FoldedSource elems -> FoldedSource $ fmap f <$> elems + EvalInline code -> EvalInline $ f code + Signature x -> Signature x + SignatureInline x -> SignatureInline x + Word x -> Word x + Group join -> Group $ bimap f g join + +data EmbedLink + = EmbedTypeLink (Token (HQ'.HashQualified Name)) + | EmbedTermLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +data SourceElement code = SourceElement EmbedLink [EmbedAnnotation code] + deriving (Eq, Ord, Show, Functor) + +newtype EmbedSignatureLink = EmbedSignatureLink (Token (HQ'.HashQualified Name)) + deriving (Eq, Ord, Show) + +newtype Join code a = Join (NonEmpty (Leaf code a)) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) + +instance Bifunctor Join where + bimap f g (Join leaves) = Join $ bimap f g <$> leaves + +newtype EmbedAnnotation code + = -- | Always a Transclude + EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) (Leaf code Void)) + deriving (Eq, Ord, Show) + +instance Functor EmbedAnnotation where + fmap f (EmbedAnnotation ann) = EmbedAnnotation $ first f <$> ann + +instance (Annotated code, Annotated a) => Annotated (Top code a) where + ann = \case + Section title body -> ann title <> ann body + Eval code -> ann code + ExampleBlock code -> ann code + CodeBlock label body -> ann label <> ann body + BulletedList items -> ann items + NumberedList items -> ann $ snd <$> items + Paragraph leaves -> ann leaves + +instance (Annotated a) => Annotated (Column a) where + ann (Column para list) = ann para <> ann list + +instance (Annotated code, Annotated a) => Annotated (Leaf code a) where + ann = \case + Link link -> ann link + NamedLink label target -> ann label <> ann target + Example code -> ann code + Transclude code -> ann code + Bold para -> ann para + Italic para -> ann para + Strikethrough para -> ann para + Verbatim word -> ann word + Code word -> ann word + Source elems -> ann elems + FoldedSource elems -> ann elems + EvalInline code -> ann code + Signature links -> ann links + SignatureInline link -> ann link + Word text -> ann text + Group (Join leaves) -> ann leaves + +instance Annotated EmbedLink where + ann = \case + EmbedTypeLink name -> ann name + EmbedTermLink name -> ann name + +instance (Annotated code) => Annotated (SourceElement code) where + ann (SourceElement link target) = ann link <> ann target + +instance Annotated EmbedSignatureLink where + ann (EmbedSignatureLink name) = ann name + +instance (Annotated code) => Annotated (EmbedAnnotation code) where + ann (EmbedAnnotation a) = either ann ann a diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 4b097e6021..31ee026b7c 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -26,6 +26,7 @@ library Unison.Syntax.Name Unison.Syntax.NameSegment Unison.Syntax.Parser + Unison.Syntax.Parser.Doc.Data Unison.Syntax.ReservedWords Unison.Syntax.ShortHash Unison.Syntax.Var From 70fe615570ed57026fb58b8939a643d9155e58b0 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 14:33:40 -0600 Subject: [PATCH 14/22] Add `Data.Functor.Classes` instances These are needed for the new Doc types, but had been stubbed out. Moving the Doc types to their own module forced the changes that got in the way of generating these with Template Haskell. 
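For reference, a minimal sketch of the `deriving-compat` pattern used
below (the `Pair` type here is hypothetical, standing in for
one-parameter types like `Column` and `Join`):

    {-# LANGUAGE TemplateHaskell #-}

    import Data.Eq.Deriving (deriveEq1)
    import Data.Ord.Deriving (deriveOrd1)
    import Text.Show.Deriving (deriveShow1)

    -- A hypothetical one-parameter container.
    data Pair a = Pair a a
      deriving (Eq, Ord, Show)

    -- Each splice generates the corresponding lifted instance, e.g.
    -- `instance Eq1 Pair`. Wrappers such as `Cofree` need these
    -- lifted instances in order to get `Eq`/`Ord`/`Show` themselves.
    $(deriveEq1 ''Pair)
    $(deriveOrd1 ''Pair)
    $(deriveShow1 ''Pair)

The `deriveEq2`/`deriveOrd2`/`deriveShow2` splices work the same way
for the two-parameter types `Top` and `Leaf`.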
--- .../src/Unison/Syntax/TermParser.hs | 4 +- unison-syntax/package.yaml | 1 + .../src/Unison/Syntax/Parser/Doc/Data.hs | 90 +++++++++++-------- unison-syntax/unison-syntax.cabal | 2 + 4 files changed, 57 insertions(+), 40 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 89d5504079..6433bf220c 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -598,7 +598,7 @@ doc2Block = do Doc.EmbedTermLink ident -> Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docSourceElement :: Doc.SourceElement [L.Token L.Lexeme] -> TermP v m + docSourceElement :: Doc.SourceElement (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns @@ -608,7 +608,7 @@ doc2Block = do docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docEmbedAnnotation :: Doc.EmbedAnnotation [L.Token L.Lexeme] -> TermP v m + docEmbedAnnotation :: Doc.EmbedAnnotation (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes diff --git a/unison-syntax/package.yaml b/unison-syntax/package.yaml index ccb1a057d7..b093dc182f 100644 --- a/unison-syntax/package.yaml +++ b/unison-syntax/package.yaml @@ -9,6 +9,7 @@ dependencies: - bytes - containers - cryptonite + - deriving-compat - extra - free - lens diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs index 4a88200b8b..5167b2bcf6 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE TemplateHaskell #-} + -- | Haskell parallel to @unison/base.Doc@. -- -- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The @@ -7,8 +9,10 @@ -- line. 
module Unison.Syntax.Parser.Doc.Data where -import Data.Functor.Classes +import Data.Eq.Deriving (deriveEq1, deriveEq2) import Data.List.NonEmpty (NonEmpty) +import Data.Ord.Deriving (deriveOrd1, deriveOrd2) +import Text.Show.Deriving (deriveShow1, deriveShow2) import Unison.HashQualifiedPrime qualified as HQ' import Unison.Name (Name) import Unison.Parser.Ann (Annotated (..)) @@ -19,7 +23,7 @@ newtype UntitledSection a = UntitledSection [a] deriving (Eq, Ord, Show, Foldable, Functor, Traversable) data Top code a - = -- | The first argument is always a Paragraph + = -- | The first argument is always a `Paragraph` Section a [a] | Eval code | ExampleBlock code @@ -29,30 +33,15 @@ data Top code a | Paragraph (NonEmpty (Leaf code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -instance Eq2 Top where - liftEq2 _ _ _ _ = True - -instance (Eq code) => Eq1 (Top code) - -instance Ord2 Top where - liftCompare2 _ _ _ _ = LT - -instance (Ord code) => Ord1 (Top code) - -instance Show2 Top where - liftShowsPrec2 _ _ _ _ _ _ x = x - -instance (Show code) => Show1 (Top code) - data Column a - = -- | The first is always a Paragraph, and the second a Bulleted or Numbered List + = -- | The first is always a `Paragraph`, and the second a `BulletedList` or `NumberedList` Column a (Maybe a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) data Leaf code a = Link EmbedLink | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of - -- Transcludes & Words) + -- `Transclude`s & `Word`s) NamedLink a (Leaf code Void) | Example code | Transclude code @@ -66,13 +55,15 @@ data Leaf code a Verbatim (Leaf Void Void) | -- | Always a Word Code (Leaf Void Void) - | Source (NonEmpty (SourceElement code)) - | FoldedSource (NonEmpty (SourceElement code)) + | -- | Always a Transclude + Source (NonEmpty (SourceElement (Leaf code Void))) + | -- | Always a Transclude + FoldedSource (NonEmpty (SourceElement (Leaf code Void))) | EvalInline code | Signature (NonEmpty EmbedSignatureLink) | SignatureInline EmbedSignatureLink | Word (Token String) - | Group (Join code a) + | Group (Join (Leaf code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) instance Bifunctor Leaf where @@ -86,38 +77,31 @@ instance Bifunctor Leaf where Strikethrough a -> Strikethrough $ g a Verbatim leaf -> Verbatim leaf Code leaf -> Code leaf - Source elems -> Source $ fmap f <$> elems - FoldedSource elems -> FoldedSource $ fmap f <$> elems + Source elems -> Source $ fmap (first f) <$> elems + FoldedSource elems -> FoldedSource $ fmap (first f) <$> elems EvalInline code -> EvalInline $ f code Signature x -> Signature x SignatureInline x -> SignatureInline x Word x -> Word x - Group join -> Group $ bimap f g join + Group join -> Group $ bimap f g <$> join data EmbedLink = EmbedTypeLink (Token (HQ'.HashQualified Name)) | EmbedTermLink (Token (HQ'.HashQualified Name)) deriving (Eq, Ord, Show) -data SourceElement code = SourceElement EmbedLink [EmbedAnnotation code] - deriving (Eq, Ord, Show, Functor) +data SourceElement a = SourceElement EmbedLink [EmbedAnnotation a] + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) newtype EmbedSignatureLink = EmbedSignatureLink (Token (HQ'.HashQualified Name)) deriving (Eq, Ord, Show) -newtype Join code a = Join (NonEmpty (Leaf code a)) +newtype Join a = Join (NonEmpty a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -instance Bifunctor Join where - bimap f g (Join leaves) = Join $ bimap f g <$> leaves - -newtype 
EmbedAnnotation code - = -- | Always a Transclude - EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) (Leaf code Void)) - deriving (Eq, Ord, Show) - -instance Functor EmbedAnnotation where - fmap f (EmbedAnnotation ann) = EmbedAnnotation $ first f <$> ann +newtype EmbedAnnotation a + = EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) a) + deriving (Eq, Ord, Show, Foldable, Functor, Traversable) instance (Annotated code, Annotated a) => Annotated (Top code a) where ann = \case @@ -164,3 +148,33 @@ instance Annotated EmbedSignatureLink where instance (Annotated code) => Annotated (EmbedAnnotation code) where ann (EmbedAnnotation a) = either ann ann a + +$(deriveEq1 ''Column) +$(deriveOrd1 ''Column) +$(deriveShow1 ''Column) + +$(deriveEq1 ''EmbedAnnotation) +$(deriveOrd1 ''EmbedAnnotation) +$(deriveShow1 ''EmbedAnnotation) + +$(deriveEq1 ''SourceElement) +$(deriveOrd1 ''SourceElement) +$(deriveShow1 ''SourceElement) + +$(deriveEq1 ''Join) +$(deriveOrd1 ''Join) +$(deriveShow1 ''Join) + +$(deriveEq1 ''Leaf) +$(deriveOrd1 ''Leaf) +$(deriveShow1 ''Leaf) +$(deriveEq2 ''Leaf) +$(deriveOrd2 ''Leaf) +$(deriveShow2 ''Leaf) + +$(deriveEq1 ''Top) +$(deriveOrd1 ''Top) +$(deriveShow1 ''Top) +$(deriveEq2 ''Top) +$(deriveOrd2 ''Top) +$(deriveShow2 ''Top) diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 31ee026b7c..853da4c817 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -69,6 +69,7 @@ library , bytes , containers , cryptonite + , deriving-compat , extra , free , lens @@ -127,6 +128,7 @@ test-suite syntax-tests , code-page , containers , cryptonite + , deriving-compat , easytest , extra , free From 31f952201c5a1448af0eb78b5a9dbff63f12c05f Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 15:59:51 -0600 Subject: [PATCH 15/22] Simplify `restoreStack` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It’s only used inside `local`, so its attempts to restore the layout are for naught. --- unison-syntax/src/Unison/Syntax/Lexer.hs | 62 ++++++------------------ 1 file changed, 15 insertions(+), 47 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index c0d1c3c04c..e2cba29dc4 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -257,7 +257,7 @@ token'' tok p = do topHasClosePair :: Layout -> Bool topHasClosePair [] = False topHasClosePair ((name, _) : _) = - name `elem` ["syntax.docTransclude", "{", "(", "[", "handle", "match", "if", "then"] + name `elem` ["DUMMY", "{", "(", "[", "handle", "match", "if", "then"] showErrorFancy :: (P.ShowErrorComponent e) => P.ErrorFancy e -> String showErrorFancy = \case @@ -394,22 +394,6 @@ infixl 2 <+> (<+>) :: (Monoid a) => P a -> P a -> P a p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) --- Runs the parser `p`, then: --- 1. resets the layout stack to be what it was before `p`. --- 2. emits enough closing tokens to reach `lbl` but not pop it. 
--- (you can think of this as just dealing with a final "unclosed" --- block at the end of `p`) -restoreStack :: String -> P [Token Lexeme] -> P [Token Lexeme] -restoreStack lbl p = do - layout1 <- S.gets layout - p <- p - s2 <- S.get - let (pos1, pos2) = foldl' (\_ b -> (start b, end b)) mempty p - unclosed = takeWhile (\(lbl', _) -> lbl' /= lbl) (layout s2) - closes = replicate (length unclosed) (Token Close pos1 pos2) - S.put (s2 {layout = layout1}) - pure $ p <> closes - type DocTree = Cofree (Doc.Top [Token Lexeme]) Ann -- | The `Doc` lexer as documented on unison-lang.org @@ -501,7 +485,7 @@ sepBy1' p sep = liftA2 (:|) p . many $ sep *> p -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). docBody :: P end -> P (Doc.UntitledSection DocTree) -docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) +docBody docClose = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) where wordyKw kw = separated wordySep (lit kw) sectionElem = section <|> fencedBlock <|> list <|> paragraph @@ -626,8 +610,6 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) ex <- CP.space *> lexemes' end pure ex - docClose = [] <$ docClose' - link = P.label "link (examples: {type List}, {Nat.+})" $ fmap Doc.Link $ @@ -636,20 +618,7 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) expr = fmap Doc.Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - openAs "{{" "syntax.docTransclude" - *> do - env0 <- S.get - -- we re-allow layout within a transclusion, then restore it to its - -- previous state after - S.put (env0 {inLayout = True}) - -- Note: this P.lookAhead ensures the }} isn't consumed, - -- so it can be consumed below by the `close` which will - -- pop items off the layout stack up to the nearest enclosing - -- syntax.docTransclude. - ts <- lexemes' (P.lookAhead ([] <$ lit "}}")) - S.modify (\env -> env {inLayout = inLayout env0}) - pure ts - <* close ["syntax.docTransclude"] (lit "}}") + lit "{{" *> lexemes' ([] <$ lit "}}") nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace @@ -673,16 +642,12 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) fence <$ guard b CP.space - *> local - (\env -> env {inLayout = True, opening = Just "docEval"}) - (restoreStack "docEval" $ lexemes' ([] <$ lit fence)) + *> lexemes' ([] <$ lit fence) exampleBlock = fmap (wrap' . Doc.ExampleBlock) $ do void $ lit "@typecheck" <* CP.space fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - local - (\env -> env {inLayout = True, opening = Just "docExampleBlock"}) - (restoreStack "docExampleBlock" $ lexemes' ([] <$ lit fence)) + lexemes' $ [] <$ lit fence uncolumn column tabWidth s = let skip col r | col < 1 = r @@ -855,10 +820,16 @@ docBody docClose' = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) wrapSimple2 fn a b = ann a <> ann b :< fn a b lexemes' :: P [Token Lexeme] -> P [Token Lexeme] -lexemes' = +lexemes' eof = -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one, -- runs `postLex`, then removes it. - fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) . lexemes + fmap (tail . postLex . 
(Token (Open "fake") mempty mempty :)) $ + local (\env -> env {inLayout = True, opening = Just "DUMMY"}) do + p <- lexemes eof + -- deals with a final "unclosed" block at the end of `p`) + unclosed <- takeWhile (("DUMMY" /=) . fst) . layout <$> S.get + let pos = end $ last p + pure $ p <> replicate (length unclosed) (Token Close pos pos) -- | Consumes an entire Unison “module”. lexemes :: P [Token Lexeme] -> P [Token Lexeme] @@ -1245,11 +1216,8 @@ separated :: (Char -> Bool) -> P a -> P a separated ok p = P.try $ p <* P.lookAhead (void (P.satisfy ok) <|> P.eof) open :: String -> P [Token Lexeme] -open b = openAs b b - -openAs :: String -> String -> P [Token Lexeme] -openAs syntax b = do - token <- tokenP $ lit syntax +open b = do + token <- tokenP $ lit b env <- S.get S.put (env {opening = Just b}) pure [Open b <$ token] From 6f2d188e5c8a0edae756046c7adaa0fbd9581407 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 25 Jul 2024 14:35:26 -0600 Subject: [PATCH 16/22] Split Doc parser from Unison lexer --- parser-typechecker/src/Unison/PrintError.hs | 2 +- .../src/Unison/Syntax/TermParser.hs | 2 +- .../src/Unison/Syntax/TermPrinter.hs | 2 +- .../src/Unison/Codebase/Editor/HandleInput.hs | 5 +- unison-cli/src/Unison/LSP/FileAnalysis.hs | 2 +- unison-cli/src/Unison/LSP/Types.hs | 2 +- unison-syntax/src/Unison/Syntax/Lexer.hs | 1354 +---------------- .../src/Unison/Syntax/Lexer/Unison.hs | 910 +++++++++++ unison-syntax/src/Unison/Syntax/Parser.hs | 5 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 476 ++++++ unison-syntax/test/Main.hs | 2 +- unison-syntax/unison-syntax.cabal | 2 + 12 files changed, 1430 insertions(+), 1334 deletions(-) create mode 100644 unison-syntax/src/Unison/Syntax/Lexer/Unison.hs create mode 100644 unison-syntax/src/Unison/Syntax/Parser/Doc.hs diff --git a/parser-typechecker/src/Unison/PrintError.hs b/parser-typechecker/src/Unison/PrintError.hs index 8b73b179f1..dd796c0159 100644 --- a/parser-typechecker/src/Unison/PrintError.hs +++ b/parser-typechecker/src/Unison/PrintError.hs @@ -54,7 +54,7 @@ import Unison.Result qualified as Result import Unison.Settings qualified as Settings import Unison.Symbol (Symbol) import Unison.Syntax.HashQualified qualified as HQ (toText) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toText) import Unison.Syntax.NamePrinter (prettyHashQualified0) import Unison.Syntax.Parser (Annotated, ann) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 6433bf220c..4c3069b9ff 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -47,7 +47,7 @@ import Unison.Pattern qualified as Pattern import Unison.Prelude import Unison.Reference (Reference) import Unison.Referent (Referent) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toText, toVar, unsafeParseVar) import Unison.Syntax.NameSegment qualified as NameSegment import Unison.Syntax.Parser hiding (seq) diff --git a/parser-typechecker/src/Unison/Syntax/TermPrinter.hs b/parser-typechecker/src/Unison/Syntax/TermPrinter.hs index faeda76020..5c41701bf8 100644 --- a/parser-typechecker/src/Unison/Syntax/TermPrinter.hs +++ b/parser-typechecker/src/Unison/Syntax/TermPrinter.hs @@ -51,7 +51,7 @@ import Unison.Reference qualified as Reference import Unison.Referent (Referent) import 
Unison.Referent qualified as Referent import Unison.Syntax.HashQualified qualified as HQ (unsafeFromVar) -import Unison.Syntax.Lexer (showEscapeChar) +import Unison.Syntax.Lexer.Unison (showEscapeChar) import Unison.Syntax.Name qualified as Name (isSymboly, parseText, parseTextEither, toText, unsafeParseText) import Unison.Syntax.NamePrinter (styleHashQualified'') import Unison.Syntax.NameSegment qualified as NameSegment (toEscapedText) diff --git a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs index e85879cc4a..e17d3fdd9e 100644 --- a/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs +++ b/unison-cli/src/Unison/Codebase/Editor/HandleInput.hs @@ -151,8 +151,7 @@ import Unison.ShortHash qualified as SH import Unison.Sqlite qualified as Sqlite import Unison.Symbol (Symbol) import Unison.Syntax.HashQualified qualified as HQ (parseTextWith, toText) -import Unison.Syntax.Lexer qualified as L -import Unison.Syntax.Lexer qualified as Lexer +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toText, toVar, unsafeParseVar) import Unison.Syntax.NameSegment qualified as NameSegment import Unison.Syntax.Parser qualified as Parser @@ -1137,7 +1136,7 @@ handleFindI isVerbose fscope ws input = do -- name query qs -> do - let anythingBeforeHash :: Megaparsec.Parsec (Lexer.Token Text) [Char] Text + let anythingBeforeHash :: Megaparsec.Parsec (L.Token Text) [Char] Text anythingBeforeHash = Text.pack <$> Megaparsec.takeWhileP Nothing (/= '#') let srs = searchBranchScored diff --git a/unison-cli/src/Unison/LSP/FileAnalysis.hs b/unison-cli/src/Unison/LSP/FileAnalysis.hs index 7a7ae006cf..bec9f8bf9f 100644 --- a/unison-cli/src/Unison/LSP/FileAnalysis.hs +++ b/unison-cli/src/Unison/LSP/FileAnalysis.hs @@ -57,7 +57,7 @@ import Unison.Result (Note) import Unison.Result qualified as Result import Unison.Symbol (Symbol) import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name import Unison.Syntax.Parser qualified as Parser import Unison.Syntax.TypePrinter qualified as TypePrinter diff --git a/unison-cli/src/Unison/LSP/Types.hs b/unison-cli/src/Unison/LSP/Types.hs index b368e915ef..268034ea5a 100644 --- a/unison-cli/src/Unison/LSP/Types.hs +++ b/unison-cli/src/Unison/LSP/Types.hs @@ -41,7 +41,7 @@ import Unison.Server.Backend qualified as Backend import Unison.Server.NameSearch (NameSearch) import Unison.Sqlite qualified as Sqlite import Unison.Symbol -import Unison.Syntax.Lexer qualified as Lexer +import Unison.Syntax.Lexer.Unison qualified as Lexer import Unison.Type (Type) import Unison.UnisonFile qualified as UF import Unison.UnisonFile.Summary (FileSummary (..)) diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index e2cba29dc4..cfd932cd7e 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1,21 +1,13 @@ {-# LANGUAGE TemplateHaskell #-} {-# OPTIONS_GHC -Wno-orphans #-} +-- | This currently contains a mix of general lexing utilities and identifier-y lexers. 
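+--
+-- After this split, the Unison-specific pieces (the `Lexeme` type, layout
+-- handling, and the `lexer` entry point) live in "Unison.Syntax.Lexer.Unison",
+-- and the `Doc` parser lives in "Unison.Syntax.Parser.Doc"; this module keeps
+-- only the shared primitives that both build on.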
module Unison.Syntax.Lexer ( Token (..), Line, Column, Err (..), Pos (..), - Lexeme (..), - DocTree, - lexer, - preParse, - escapeChars, - debugFilePreParse, - debugPreParse, - debugPreParse', - showEscapeChar, touches, -- * Character classifiers @@ -23,28 +15,40 @@ module Unison.Syntax.Lexer wordyIdStartChar, symbolyIdChar, - -- * Error formatting - formatTrivialError, - displayLexeme, + -- * new exports + BlockName, + Layout, + ParsingEnv (..), + P, + local, + parseFailure, + space, + lit, + err, + commitAfter2, + (<+>), + some', + someTill', + sepBy1', + separated, + wordySep, + identifierP, + wordyIdSegP, + shortHashP, + topBlockName, + pop, + typeOrAbilityAlt, + typeModifiersAlt, + inc, ) where import Control.Comonad.Cofree (Cofree ((:<))) -import Control.Lens qualified as Lens import Control.Monad.State qualified as S -import Data.Char (isAlphaNum, isControl, isDigit, isSpace, ord, toLower) -import Data.Foldable qualified as Foldable -import Data.List qualified as List -import Data.List.Extra qualified as List +import Data.Char (isSpace) import Data.List.NonEmpty (NonEmpty ((:|))) -import Data.List.NonEmpty qualified as Nel -import Data.List.NonEmpty qualified as NonEmpty -import Data.Map.Strict qualified as Map -import Data.Set qualified as Set import Data.Text qualified as Text -import GHC.Exts (sortWith) import Text.Megaparsec qualified as P -import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Text.Megaparsec.Error qualified as EP @@ -52,24 +56,16 @@ import Text.Megaparsec.Internal qualified as PI import Unison.HashQualifiedPrime qualified as HQ' import Unison.Lexer.Pos (Column, Line, Pos (Pos), column, line) import Unison.Name (Name) -import Unison.Name qualified as Name import Unison.NameSegment (NameSegment) -import Unison.NameSegment qualified as NameSegment (docSegment) -import Unison.NameSegment.Internal qualified as NameSegment -import Unison.Parser.Ann (Ann, Annotated (..)) +import Unison.Parser.Ann (Annotated (..)) import Unison.Prelude import Unison.ShortHash (ShortHash) -import Unison.ShortHash qualified as SH -import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) -import Unison.Syntax.Lexer.Token (Token (..), posP, tokenP) -import Unison.Syntax.Name qualified as Name (isSymboly, nameP, toText, unsafeParseText) +import Unison.Syntax.Lexer.Token (Token (..), posP) +import Unison.Syntax.Name qualified as Name (nameP) import Unison.Syntax.NameSegment (symbolyIdChar, wordyIdChar, wordyIdStartChar) import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) -import Unison.Syntax.Parser.Doc.Data qualified as Doc -import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) +import Unison.Syntax.ReservedWords (typeModifiers, typeOrAbility) import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) -import Unison.Util.Bytes qualified as Bytes -import Unison.Util.Monoid (intercalateMap) instance (Annotated a) => Annotated (Cofree f a) where ann (a :< _) = ann a @@ -128,29 +124,6 @@ data Err | UnexpectedTokens String -- Catch-all for all other lexer errors, representing some unexpected tokens. 
deriving stock (Eq, Ord, Show) -- richer algebra --- Design principle: --- `[Lexeme]` should be sufficient information for parsing without --- further knowledge of spacing or indentation levels --- any knowledge of comments -data Lexeme - = Open String -- start of a block - | Semi IsVirtual -- separator between elements of a block - | Close -- end of a block - | Reserved String -- reserved tokens such as `{`, `(`, `type`, `of`, etc - | Textual String -- text literals, `"foo bar"` - | Character Char -- character literals, `?X` - | WordyId (HQ'.HashQualified Name) -- a (non-infix) identifier. invariant: last segment is wordy - | SymbolyId (HQ'.HashQualified Name) -- an infix identifier. invariant: last segment is symboly - | Blank String -- a typed hole or placeholder - | Numeric String -- numeric literals, left unparsed - | Bytes Bytes.Bytes -- bytes literals - | Hash ShortHash -- hash literals - | Err Err - | Doc (Doc.UntitledSection DocTree) - deriving stock (Eq, Show, Ord) - -type IsVirtual = Bool -- is it a virtual semi or an actual semi? - space :: P () space = LP.space @@ -163,15 +136,6 @@ space = lit :: String -> P String lit = P.try . LP.symbol (pure ()) -token :: P Lexeme -> P [Token Lexeme] -token = token' (\a start end -> [Token a start end]) - --- Token parser: strips trailing whitespace and comments after a --- successful parse, and also takes care of emitting layout tokens --- (such as virtual semicolons and closing tokens). -token' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] -token' tok p = LP.lexeme space (token'' tok p) - -- Committed failure err :: Pos -> Err -> P x err start t = do @@ -193,283 +157,11 @@ commitAfter2 a b f = do (a, b) <- P.try $ liftA2 (,) a b f a b --- Token parser implementation which leaves trailing whitespace and comments --- but does emit layout tokens such as virtual semicolons and closing tokens. -token'' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] -token'' tok p = do - start <- posP - -- We save the current state so we can backtrack the state if `p` fails. - env <- S.get - layoutToks <- case opening env of - -- If we're opening a block named b, we push (b, currentColumn) onto - -- the layout stack. Example: - -- - -- blah = cases - -- {- A comment -} - -- -- A one-line comment - -- 0 -> "hi" - -- 1 -> "bye" - -- - -- After the `cases` token, the state will be opening = Just "cases", - -- meaning the parser is searching for the next non-whitespace/comment - -- character to determine the leftmost column of the `cases` block. - -- That will be the column of the `0`. - Just blockname -> - -- special case - handling of empty blocks, as in: - -- foo = - -- bar = 42 - if blockname == "=" && column start <= top l && not (null l) - then do - S.put (env {layout = (blockname, column start + 1) : l, opening = Nothing}) - pops start - else [] <$ S.put (env {layout = layout', opening = Nothing}) - where - layout' = (blockname, column start) : l - l = layout env - -- If we're not opening a block, we potentially pop from - -- the layout stack and/or emit virtual semicolons. 
- Nothing -> if inLayout env then pops start else pure [] - beforeTokenPos <- posP - a <- p <|> (S.put env >> fail "resetting state") - endPos <- posP - pure $ layoutToks ++ tok a beforeTokenPos endPos - where - pops :: Pos -> P [Token Lexeme] - pops p = do - env <- S.get - let l = layout env - if top l == column p && topContainsVirtualSemis l - then pure [Token (Semi True) p p] - else - if column p > top l || topHasClosePair l - then pure [] - else - if column p < top l - then S.put (env {layout = pop l}) >> ((Token Close p p :) <$> pops p) - else error "impossible" - - -- don't emit virtual semis in (, {, or [ blocks - topContainsVirtualSemis :: Layout -> Bool - topContainsVirtualSemis = \case - [] -> False - ((name, _) : _) -> name /= "(" && name /= "{" && name /= "[" - - topHasClosePair :: Layout -> Bool - topHasClosePair [] = False - topHasClosePair ((name, _) : _) = - name `elem` ["DUMMY", "{", "(", "[", "handle", "match", "if", "then"] - -showErrorFancy :: (P.ShowErrorComponent e) => P.ErrorFancy e -> String -showErrorFancy = \case - P.ErrorFail msg -> msg - P.ErrorIndentation ord ref actual -> - "incorrect indentation (got " - <> show (P.unPos actual) - <> ", should be " - <> p - <> show (P.unPos ref) - <> ")" - where - p = case ord of - LT -> "less than " - EQ -> "equal to " - GT -> "greater than " - P.ErrorCustom a -> P.showErrorComponent a - -lexer :: String -> String -> [Token Lexeme] -lexer scope rem = - case flip S.evalState env0 $ P.runParserT (lexemes eof) scope rem of - Left e -> - let errsWithSourcePos = - fst $ - P.attachSourcePos - P.errorOffset - (toList (P.bundleErrors e)) - (P.bundlePosState e) - errorToTokens :: (EP.ParseError String (Token Err), P.SourcePos) -> [Token Lexeme] - errorToTokens (err, top) = case err of - P.FancyError _ (customErrs -> es) | not (null es) -> es - P.FancyError _errOffset es -> - let msg = intercalateMap "\n" showErrorFancy es - in [Token (Err (UnexpectedTokens msg)) (toPos top) (toPos top)] - P.TrivialError _errOffset mayUnexpectedTokens expectedTokens -> - let unexpectedStr :: Set String - unexpectedStr = - mayUnexpectedTokens - & fmap errorItemToString - & maybeToList - & Set.fromList - errorLength :: Int - errorLength = case Set.toList unexpectedStr of - [] -> 0 - (x : _) -> length x - expectedStr :: Set String - expectedStr = - expectedTokens - & Set.map errorItemToString - err = UnexpectedTokens $ formatTrivialError unexpectedStr expectedStr - startPos = toPos top - -- This is just an attempt to highlight errors better in source excerpts. - -- It may not work in all cases, but should generally provide a better experience. 
- endPos = startPos & \(Pos l c) -> Pos l (c + errorLength) - in [Token (Err err) startPos endPos] - in errsWithSourcePos >>= errorToTokens - Right ts -> postLex $ Token (Open scope) topLeftCorner topLeftCorner : ts - where - eof :: P [Token Lexeme] - eof = P.try do - p <- P.eof >> posP - n <- maybe 0 (const 1) <$> S.gets opening - l <- S.gets layout - pure $ replicate (length l + n) (Token Close p p) - errorItemToString :: EP.ErrorItem Char -> String - errorItemToString = \case - (P.Tokens ts) -> Foldable.toList ts - (P.Label ts) -> Foldable.toList ts - (P.EndOfInput) -> "end of input" - customErrs es = [Err <$> e | P.ErrorCustom e <- toList es] - toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col) - env0 = ParsingEnv [] (Just scope) True [0] 0 - --- | hacky postprocessing pass to do some cleanup of stuff that's annoying to --- fix without adding more state to the lexer: --- - 1+1 lexes as [1, +1], convert this to [1, +, 1] --- - when a semi followed by a virtual semi, drop the virtual, lets you --- write --- foo x = action1; --- 2 --- - semi immediately after first Open is ignored -tweak :: (Token Lexeme) -> [Token Lexeme] -> [Token Lexeme] -tweak h@(Token (Semi False) _ _) (Token (Semi True) _ _ : t) = h : t --- __NB__: This case only exists to guard against the following one -tweak h@(Token (Reserved _) _ _) t = h : t -tweak t1 (t2@(Token (Numeric num) _ _) : rem) - | notLayout t1 && touches t1 t2 && isSigned num = - t1 - : Token - (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num))))) - (start t2) - (inc $ start t2) - : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2) - : rem - where - isSigned num = all (\ch -> ch == '-' || ch == '+') $ take 1 num -tweak h t = h : t - -formatTrivialError :: Set String -> Set String -> [Char] -formatTrivialError unexpectedTokens expectedTokens = - let unexpectedMsg = case Set.toList unexpectedTokens of - [] -> "I found something I didn't expect." - [x] -> - let article = case x of - (c : _) | c `elem` ("aeiou" :: String) -> "an" - _ -> "a" - in "I was surprised to find " <> article <> " " <> x <> " here." - xs -> "I was surprised to find these:\n\n* " <> List.intercalate "\n* " xs - expectedMsg = case Set.toList expectedTokens of - [] -> Nothing - xs -> Just $ "\nI was expecting one of these instead:\n\n* " <> List.intercalate "\n* " xs - in concat $ catMaybes [Just unexpectedMsg, expectedMsg] - -displayLexeme :: Lexeme -> String -displayLexeme = \case - Open o -> o - Semi True -> "end of stanza" - Semi False -> "semicolon" - Close -> "end of section" - Reserved r -> "'" <> r <> "'" - Textual t -> "\"" <> t <> "\"" - Character c -> "?" <> [c] - WordyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq) - SymbolyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq) - Blank b -> b - Numeric n -> n - Bytes _b -> "bytes literal" - Hash h -> Text.unpack (SH.toText h) - Err e -> show e - Doc _ -> "doc structure" - infixl 2 <+> (<+>) :: (Monoid a) => P a -> P a -> P a p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) -type DocTree = Cofree (Doc.Top [Token Lexeme]) Ann - --- | The `Doc` lexer as documented on unison-lang.org -doc2 :: P [Token Lexeme] -doc2 = do - -- Ensure we're at a doc before we start consuming tokens - P.lookAhead (lit "{{") - openStart <- posP - -- Produce any layout tokens, such as closing the last open block or virtual semicolons - -- We don't use 'token' on "{{" directly because we don't want to duplicate layout - -- tokens if we do the rewrite hack for type-docs below. 
- beforeStartToks <- token' ignore (pure ()) - void $ lit "{{" - openEnd <- posP - CP.space - env0 <- S.get - -- Disable layout while parsing the doc block and reset the section number - (docTok, closeTok) <- local - ( \env -> - env - { inLayout = False, - parentSections = 0 : (parentSections env0) - } - ) - do - body <- docBody (lit "}}") - closeStart <- posP - lit "}}" - closeEnd <- posP - pure (Token (Doc body) openStart closeEnd, Token Close closeStart closeEnd) - -- Parse any layout tokens after the doc block, e.g. virtual semicolon - endToks <- token' ignore (pure ()) - -- Hack to allow anonymous doc blocks before type decls - -- {{ Some docs }} Foo.doc = {{ Some docs }} - -- ability Foo where => ability Foo where - -- - -- __FIXME__: This should be done _after_ parsing, not in lexing. - tn <- subsequentTypeName - pure $ - beforeStartToks <> case (tn) of - -- If we're followed by a type, we rewrite the doc block to be a named doc block. - Just (WordyId tname) - | isTopLevel -> - Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd - : Token (Open "=") openStart openEnd - : docTok - -- We need an extra 'Close' here because we added an extra Open above. - : closeTok - : endToks - where - isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 - _ -> docTok : endToks - where - -- DUPLICATED - wordyKw kw = separated wordySep (lit kw) - subsequentTypeName = P.lookAhead . P.optional $ do - let lit' s = lit s <* sp - let modifier = typeModifiersAlt (lit' . Text.unpack) - let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) - _ <- optional modifier *> typeOrAbility' *> sp - Token name start stop <- tokenP identifierP - if Name.isSymboly (HQ'.toName name) - then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop) - else pure (WordyId name) - ignore _ _ _ = [] - -- DUPLICATED - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 - -- | Like `P.some`, but returns an actual `NonEmpty`. some' :: P a -> P (NonEmpty a) some' p = liftA2 (:|) p $ many p @@ -482,761 +174,12 @@ someTill' p end = liftA2 (:|) p $ P.manyTill p end sepBy1' :: P a -> P sep -> P (NonEmpty a) sepBy1' p sep = liftA2 (:|) p . many $ sep *> p --- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that --- Unison wraps `Doc` literals in `}}`). -docBody :: P end -> P (Doc.UntitledSection DocTree) -docBody docClose = Doc.UntitledSection <$> P.many (sectionElem <* CP.space) - where - wordyKw kw = separated wordySep (lit kw) - sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap' . Doc.Paragraph <$> spaced leaf - reserved word = List.isPrefixOf "}}" word || all (== '#') word - - wordy :: P end -> P (Doc.Leaf [Token Lexeme] void) - wordy closing = fmap Doc.Word . tokenP . 
P.try $ do - let end = - P.lookAhead $ - void docClose - <|> void (P.satisfy isSpace) - <|> void closing - word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end - guard (not $ reserved word || null word) - pure word - - leafy closing = groupy closing gs - where - gs = - link - <|> externalLink - <|> exampleInline - <|> expr - <|> boldOrItalicOrStrikethrough closing - <|> verbatim - <|> atDoc - <|> wordy closing - - leaf = leafy mzero - - atDoc = src <|> evalInline <|> signature <|> signatureInline - where - comma = lit "," <* CP.space - src = - src' Doc.Source "@source" - <|> src' Doc.FoldedSource "@foldedSource" - srcElem = - Doc.SourceElement - <$> (typeLink <|> termLink) - <*> ( fmap (fromMaybe []) . P.optional $ - (lit "@") *> (CP.space *> annotations) - ) - where - annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space - annotations = - P.some (Doc.EmbedAnnotation <$> annotation) - src' name atName = fmap name $ do - _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' srcElem comma - _ <- lit "}" - pure s - signature = fmap Doc.Signature $ do - _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' signatureLink comma - _ <- lit "}" - pure s - signatureInline = fmap Doc.SignatureInline $ do - _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- signatureLink - _ <- lit "}" - pure s - evalInline = fmap Doc.EvalInline $ do - _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space - let inlineEvalClose = [] <$ lit "}" - s <- lexemes' inlineEvalClose - pure s - - typeLink = fmap Doc.EmbedTypeLink $ do - _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tokenP identifierP <* CP.space - - termLink = - fmap Doc.EmbedTermLink $ - tokenP identifierP <* CP.space - - signatureLink = - fmap Doc.EmbedSignatureLink $ - tokenP identifierP <* CP.space - - groupy closing p = do - Token p _ _ <- tokenP p - after <- P.optional . P.try $ leafy closing - pure $ case after of - Nothing -> p - Just after -> - Doc.Group - . Doc.Join - $ p - :| pure after - - verbatim = - P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do - Token originalText start stop <- tokenP do - -- a single backtick followed by a non-backtick is treated as monospaced - let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) - -- also two or more ' followed by that number of closing ' - quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) - P.someTill P.anySingle (lit quotes) - let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . Doc.Verbatim $ - Doc.Word $ - Token txt start stop - else - pure . Doc.Code $ - Doc.Word $ - Token originalText start stop - - exampleInline = - P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - fmap Doc.Example $ do - n <- P.try $ do - _ <- lit "`" - length <$> P.takeWhile1P (Just "backticks") (== '`') - let end :: P [Token Lexeme] = [] <$ lit (replicate (n + 1) '`') - ex <- CP.space *> lexemes' end - pure ex - - link = - P.label "link (examples: {type List}, {Nat.+})" $ - fmap Doc.Link $ - P.try $ - lit "{" *> (typeLink <|> termLink) <* lit "}" - - expr = - fmap Doc.Transclude . 
P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - lit "{{" *> lexemes' ([] <$ lit "}}") - - nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' - nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace - - -- Allows whitespace or a newline, but not more than two newlines in a row. - whitespaceWithoutParagraphBreak :: P () - whitespaceWithoutParagraphBreak = void do - void nonNewlineSpaces - optional newline >>= \case - Just _ -> void nonNewlineSpaces - Nothing -> pure () - - fencedBlock = - P.label "block eval (syntax: a fenced code block)" $ - evalUnison <|> exampleBlock <|> other - where - evalUnison = fmap (wrap' . Doc.Eval) $ do - -- commit after seeing that ``` is on its own line - fence <- P.try $ do - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) - fence <$ guard b - CP.space - *> lexemes' ([] <$ lit fence) - - exampleBlock = fmap (wrap' . Doc.ExampleBlock) $ do - void $ lit "@typecheck" <* CP.space - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - lexemes' $ [] <$ lit fence - - uncolumn column tabWidth s = - let skip col r | col < 1 = r - skip col s@('\t' : _) | col < tabWidth = s - skip col ('\t' : r) = skip (col - tabWidth) r - skip col (c : r) - | isSpace c && (not $ isControl c) = - skip (col - 1) r - skip _ s = s - in List.intercalate "\n" $ skip column <$> lines s - - other = fmap (uncurry $ wrapSimple2 Doc.CodeBlock) $ do - column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel - let tabWidth = toInteger . P.unPos $ P.defaultTabWidth - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - name <- - P.takeWhileP Nothing nonNewlineSpace - *> tokenP (P.takeWhile1P Nothing (not . isSpace)) - <* P.takeWhileP Nothing nonNewlineSpace - _ <- void CP.eol - verbatim <- - tokenP $ - uncolumn column tabWidth . trimAroundDelimiters - <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name, verbatim) - - boldOrItalicOrStrikethrough closing = do - let start = - some (P.satisfy (== '*')) - <|> some (P.satisfy (== '_')) - <|> some - (P.satisfy (== '~')) - name s = - if take 1 s == "~" - then Doc.Strikethrough - else if take 1 s == "*" then Doc.Bold else Doc.Italic - end <- P.try $ do - end <- start - P.lookAhead (P.satisfy (not . isSpace)) - pure end - name end . wrap' . Doc.Paragraph - <$> someTill' - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) - - externalLink = - P.label "hyperlink (example: [link name](https://destination.com))" $ - fmap (uncurry Doc.NamedLink) $ do - _ <- lit "[" - p <- leafies (void $ char ']') - _ <- lit "]" - _ <- lit "(" - target <- - fmap (Doc.Group . Doc.Join) $ - fmap pure link <|> some' (expr <|> wordy (char ')')) - _ <- lit ")" - pure (p, target) - - -- newline = P.optional (lit "\r") *> lit "\n" - - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 - - spaced p = some' (p <* P.optional sp) - leafies close = wrap' . Doc.Paragraph <$> spaced (leafy close) - - list = bulletedList <|> numberedList - - bulletedList = wrap' . Doc.BulletedList <$> sepBy1' bullet listSep - numberedList = wrap' . 
Doc.NumberedList <$> sepBy1' numberedItem listSep - - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) - - bulletedStart = P.try $ do - r <- listItemStart' $ [] <$ P.satisfy bulletChar - P.lookAhead (P.satisfy isSpace) - pure r - where - bulletChar ch = ch == '*' || ch == '-' || ch == '+' - - listItemStart' :: P a -> P (Int, a) - listItemStart' gutter = P.try $ do - nonNewlineSpaces - col <- column <$> posP - parentCol <- S.gets parentListColumn - guard (col > parentCol) - (col,) <$> gutter - - numberedStart = - listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - - listItemParagraph = fmap (wrap' . Doc.Paragraph) $ do - col <- column <$> posP - some' (leaf <* sep col) - where - -- Trickiness here to support hard line breaks inside of - -- a bulleted list, so for instance this parses as expected: - -- - -- * uno dos - -- tres quatro - -- * alice bob - -- carol dave eve - sep col = do - _ <- nonNewlineSpaces - _ <- - P.optional . P.try $ - newline - *> nonNewlineSpaces - *> do - col2 <- column <$> posP - guard $ col2 >= col - (P.notFollowedBy $ void numberedStart <|> void bulletedStart) - pure () - - numberedItem = P.label msg $ do - (col, s) <- numberedStart - (s,) - <$> ( fmap (uncurry Doc.Column) $ do - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p, subList) - ) - where - msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" - - bullet = fmap (uncurry Doc.Column) . P.label "bullet (examples: * item1, - item2)" $ do - (col, _) <- bulletedStart - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local - (\e -> e {parentListColumn = col}) - (P.optional $ listSep *> list) - pure (p, subList) - - newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - - -- ## Section title - -- - -- A paragraph under this section. - -- Part of the same paragraph. Blanklines separate paragraphs. - -- - -- ### A subsection title - -- - -- A paragraph under this subsection. - - -- # A section title (not a subsection) - section :: P DocTree - section = fmap (wrap' . uncurry Doc.Section) $ do - ns <- S.gets parentSections - hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp - title <- paragraph <* CP.space - let m = length hashes + head ns - body <- - local (\env -> env {parentSections = (m : (tail ns))}) $ - P.many (sectionElem <* CP.space) - pure $ (title, body) - - wrap' :: Doc.Top [Token Lexeme] DocTree -> DocTree - wrap' doc = ann doc :< doc - - wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Doc.Top [Token Lexeme] DocTree) -> a -> b -> DocTree - wrapSimple2 fn a b = ann a <> ann b :< fn a b - -lexemes' :: P [Token Lexeme] -> P [Token Lexeme] -lexemes' eof = - -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one, - -- runs `postLex`, then removes it. - fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) $ - local (\env -> env {inLayout = True, opening = Just "DUMMY"}) do - p <- lexemes eof - -- deals with a final "unclosed" block at the end of `p`) - unclosed <- takeWhile (("DUMMY" /=) . fst) . layout <$> S.get - let pos = end $ last p - pure $ p <> replicate (length unclosed) (Token Close pos pos) - --- | Consumes an entire Unison “module”. 
-lexemes :: P [Token Lexeme] -> P [Token Lexeme] -lexemes eof = - P.optional space >> do - hd <- join <$> P.manyTill toks (P.lookAhead eof) - tl <- eof - pure $ hd <> tl - where - toks :: P [Token Lexeme] - toks = - doc2 - <|> doc - <|> token numeric - <|> token character - <|> reserved - <|> token blank - <|> token identifierLexemeP - <|> (asum . map token) [semi, textual, hash] - - doc :: P [Token Lexeme] - doc = open <+> (CP.space *> fmap fixup body) <+> (close <* space) - where - open = token'' (\t _ _ -> t) $ tok (Open <$> lit "[:") - close = tok (Close <$ lit ":]") - at = lit "@" - -- this removes some trailing whitespace from final textual segment - fixup [] = [] - fixup (Token (Textual (reverse -> txt)) start stop : []) = - [Token (Textual txt') start stop] - where - txt' = reverse (dropWhile (\c -> isSpace c && not (c == '\n')) txt) - fixup (h : t) = h : fixup t - - body :: P [Token Lexeme] - body = txt <+> (atk <|> pure []) - where - ch = (":]" <$ lit "\\:]") <|> ("@" <$ lit "\\@") <|> (pure <$> P.anySingle) - txt = tok (Textual . join <$> P.manyTill ch (P.lookAhead sep)) - sep = void at <|> void close - ref = at *> (tok identifierLexemeP <|> docTyp) - atk = (ref <|> docTyp) <+> body - docTyp = do - _ <- lit "[" - typ <- tok (P.manyTill P.anySingle (P.lookAhead (lit "]"))) - _ <- lit "]" *> CP.space - t <- tok identifierLexemeP - pure $ (fmap Reserved <$> typ) <> t - - blank = - separated wordySep do - _ <- char '_' - seg <- P.optional wordyIdSegP - pure (Blank (maybe "" (Text.unpack . NameSegment.toUnescapedText) seg)) - - semi = char ';' $> Semi False - textual = Textual <$> quoted - quoted = quotedRaw <|> quotedSingleLine - quotedRaw = do - _ <- lit "\"\"\"" - n <- many (char '"') - _ <- optional (char '\n') -- initial newline is skipped - s <- P.manyTill P.anySingle (lit (replicate (length n + 3) '"')) - col0 <- column <$> posP - let col = col0 - (length n) - 3 -- this gets us first col of closing quotes - let leading = replicate (max 0 (col - 1)) ' ' - -- a last line that's equal to `leading` is ignored, since leading - -- spaces up to `col` are not considered part of the string - let tweak l = case reverse l of - last : rest - | col > 1 && last == leading -> reverse rest - | otherwise -> l - [] -> [] - pure $ case tweak (lines s) of - [] -> s - ls - | all (\l -> List.isPrefixOf leading l || all isSpace l) ls -> List.intercalate "\n" (drop (length leading) <$> ls) - | otherwise -> s - quotedSingleLine = char '"' *> P.manyTill (LP.charLiteral <|> sp) (char '"') - where - sp = lit "\\s" $> ' ' - character = Character <$> (char '?' *> (spEsc <|> LP.charLiteral)) - where - spEsc = P.try (char '\\' *> char 's' $> ' ') - - numeric = bytes <|> otherbase <|> float <|> intOrNat - where - intOrNat = P.try $ num <$> sign <*> LP.decimal - float = do - _ <- P.try (P.lookAhead (sign >> (LP.decimal :: P Int) >> (char '.' 
<|> char 'e' <|> char 'E'))) -- commit after this - start <- posP - sign <- fromMaybe "" <$> sign - base <- P.takeWhile1P (Just "base") isDigit - decimals <- - P.optional $ - let missingFractional = err start (MissingFractional $ base <> ".") - in liftA2 (<>) (lit ".") (P.takeWhile1P (Just "decimals") isDigit <|> missingFractional) - exp <- P.optional $ do - e <- map toLower <$> (lit "e" <|> lit "E") - sign <- fromMaybe "" <$> optional (lit "+" <|> lit "-") - let missingExp = err start (MissingExponent $ base <> fromMaybe "" decimals <> e <> sign) - exp <- P.takeWhile1P (Just "exponent") isDigit <|> missingExp - pure $ e <> sign <> exp - pure $ Numeric (sign <> base <> fromMaybe "" decimals <> fromMaybe "" exp) - - bytes = do - start <- posP - _ <- lit "0xs" - s <- map toLower <$> P.takeWhileP (Just "hexidecimal character") isAlphaNum - case Bytes.fromBase16 $ Bytes.fromWord8s (fromIntegral . ord <$> s) of - Left _ -> err start (InvalidBytesLiteral $ "0xs" <> s) - Right bs -> pure (Bytes bs) - otherbase = octal <|> hex - octal = do - start <- posP - commitAfter2 sign (lit "0o") $ \sign _ -> - fmap (num sign) LP.octal <|> err start InvalidOctalLiteral - hex = do - start <- posP - commitAfter2 sign (lit "0x") $ \sign _ -> - fmap (num sign) LP.hexadecimal <|> err start InvalidHexLiteral - - num :: Maybe String -> Integer -> Lexeme - num sign n = Numeric (fromMaybe "" sign <> show n) - sign = P.optional (lit "+" <|> lit "-") - - hash = Hash <$> P.try shortHashP - - reserved :: P [Token Lexeme] - reserved = - token' (\ts _ _ -> ts) $ - braces - <|> parens - <|> brackets - <|> commaSeparator - <|> delim - <|> delayOrForce - <|> keywords - <|> layoutKeywords - where - keywords = - -- yes "wordy" - just like a wordy keyword like "true", the literal "." (as in the dot in - -- "forall a. a -> a") is considered the keyword "." so long as it is either followed by EOF, a space, or some - -- non-wordy character (because ".foo" is a single identifier lexeme) - wordyKw "." - <|> symbolyKw ":" - <|> openKw "@rewrite" - <|> symbolyKw "@" - <|> symbolyKw "||" - <|> symbolyKw "|" - <|> symbolyKw "&&" - <|> wordyKw "true" - <|> wordyKw "false" - <|> wordyKw "use" - <|> wordyKw "forall" - <|> wordyKw "∀" - <|> wordyKw "termLink" - <|> wordyKw "typeLink" - - wordyKw s = separated wordySep (kw s) - symbolyKw s = separated (not . symbolyIdChar) (kw s) - - kw :: String -> P [Token Lexeme] - kw s = tokenP (lit s) <&> \token -> [Reserved <$> token] - - layoutKeywords :: P [Token Lexeme] - layoutKeywords = - ifElse - <|> withKw - <|> openKw "match" - <|> openKw "handle" - <|> typ - <|> arr - <|> rewriteArr - <|> eq - <|> openKw "cases" - <|> openKw "where" - <|> openKw "let" - <|> openKw "do" - where - ifElse = - openKw "if" - <|> closeKw' (Just "then") ["if"] (lit "then") - <|> closeKw' (Just "else") ["then"] (lit "else") - modKw = typeModifiersAlt (openKw1 wordySep . Text.unpack) - typeOrAbilityKw = typeOrAbilityAlt (openTypeKw1 . 
Text.unpack) - typ = modKw <|> typeOrAbilityKw - - withKw = do - [Token _ pos1 pos2] <- wordyKw "with" - env <- S.get - let l = layout env - case findClose ["handle", "match"] l of - Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen "'with'") - where - msgOpen = "'handle' or 'match'" - Just (withBlock, n) -> do - let b = withBlock <> "-with" - S.put (env {layout = drop n l, opening = Just b}) - let opens = [Token (Open "with") pos1 pos2] - pure $ replicate n (Token Close pos1 pos2) ++ opens - - -- In `structural/unique type` and `structural/unique ability`, - -- only the `structural` or `unique` opens a layout block, - -- and `ability` and `type` are just keywords. - openTypeKw1 t = do - b <- S.gets (topBlockName . layout) - case b of - Just mod | Set.member (Text.pack mod) typeModifiers -> wordyKw t - _ -> openKw1 wordySep t - - -- layout keyword which bumps the layout column by 1, rather than looking ahead - -- to the next token to determine the layout column - openKw1 :: (Char -> Bool) -> String -> P [Token Lexeme] - openKw1 sep kw = do - Token kw pos0 pos1 <- tokenP $ separated sep (lit kw) - S.modify (\env -> env {layout = (kw, column $ inc pos0) : layout env}) - pure [Token (Open kw) pos0 pos1] - - eq = do - [Token _ start end] <- symbolyKw "=" - env <- S.get - case topBlockName (layout env) of - -- '=' does not open a layout block if within a type declaration - Just t | t == "type" || Set.member (Text.pack t) typeModifiers -> pure [Token (Reserved "=") start end] - Just _ -> S.put (env {opening = Just "="}) >> pure [Token (Open "=") start end] - _ -> err start LayoutError - - rewriteArr = do - [Token _ start end] <- symbolyKw "==>" - env <- S.get - S.put (env {opening = Just "==>"}) >> pure [Token (Open "==>") start end] - - arr = do - [Token _ start end] <- symbolyKw "->" - env <- S.get - -- -> introduces a layout block if we're inside a `match with` or `cases` - case topBlockName (layout env) of - Just match | match `elem` matchWithBlocks -> do - S.put (env {opening = Just "->"}) - pure [Token (Open "->") start end] - _ -> pure [Token (Reserved "->") start end] - - -- a bit of lookahead here to reserve }} for closing a documentation block - braces = open "{" <|> close ["{"] p - where - p = do - l <- lit "}" - -- if we're within an existing {{ }} block, inLayout will be false - -- so we can actually allow }} to appear in normal code - inLayout <- S.gets inLayout - when (not inLayout) $ void $ P.lookAhead (P.satisfy (/= '}')) - pure l - matchWithBlocks = ["match-with", "cases"] - parens = open "(" <|> close ["("] (lit ")") - brackets = open "[" <|> close ["["] (lit "]") - -- `allowCommaToClose` determines if a comma should close inner blocks. - -- Currently there is a set of blocks where `,` is not treated specially - -- and it just emits a Reserved ",". 
There are currently only three: - -- `cases`, `match-with`, and `{` - allowCommaToClose match = not $ match `elem` ("{" : matchWithBlocks) - commaSeparator = do - env <- S.get - case topBlockName (layout env) of - Just match - | allowCommaToClose match -> - blockDelimiter ["[", "("] (lit ",") - _ -> fail "this comma is a pattern separator" - - delim = P.try $ do - ch <- P.satisfy (\ch -> ch /= ';' && Set.member ch delimiters) - pos <- posP - pure [Token (Reserved [ch]) pos (inc pos)] - - delayOrForce = separated ok $ do - token <- tokenP $ P.satisfy isDelayOrForce - pure [token <&> \op -> Reserved [op]] - where - ok c = isDelayOrForce c || isSpace c || isAlphaNum c || Set.member c delimiters || c == '\"' - --- | If it's a multi-line verbatim block we trim any whitespace representing --- indentation from the pretty-printer. --- --- E.g. --- --- @@ --- {{ --- # Heading --- ''' --- code --- indented --- ''' --- }} --- @@ --- --- Should lex to the text literal "code\n indented". --- --- If there's text in the literal that has LESS trailing whitespace than the --- opening delimiters, we don't trim it at all. E.g. --- --- @@ --- {{ --- # Heading --- ''' --- code --- ''' --- }} --- @@ --- --- Is parsed as " code". --- --- Trim the expected amount of whitespace from a text literal: --- >>> trimIndentFromVerbatimBlock 2 " code\n indented" --- "code\n indented" --- --- If the text literal has less leading whitespace than the opening delimiters, --- leave it as-is --- >>> trimIndentFromVerbatimBlock 2 "code\n indented" --- "code\n indented" -trimIndentFromVerbatimBlock :: Int -> String -> String -trimIndentFromVerbatimBlock leadingSpaces txt = fromMaybe txt $ do - List.intercalate "\n" <$> for (lines txt) \line -> do - -- If any 'stripPrefix' fails, we fail and return the unaltered text - case List.stripPrefix (replicate leadingSpaces ' ') line of - Just stripped -> Just stripped - Nothing -> - -- If it was a line with all white-space, just use an empty line, - -- this can happen easily in editors which trim trailing whitespace. - if all isSpace line - then Just "" - else Nothing - --- Trim leading/trailing whitespace from around delimiters, e.g. 
--- --- {{ --- '''___ <- whitespace here including newline --- text block --- 👇 or here --- __''' --- }} --- >>> trimAroundDelimiters " \n text block \n " --- " text block " --- --- Should leave leading and trailing line untouched if it contains non-whitespace, e.g.: --- --- ''' leading whitespace --- text block --- trailing whitespace: ''' --- >>> trimAroundDelimiters " leading whitespace\n text block \ntrailing whitespace: " --- " leading whitespace\n text block \ntrailing whitespace: " --- --- Should keep trailing newline if it's the only thing on the line, e.g.: --- --- ''' --- newline below --- --- ''' --- >>> trimAroundDelimiters "\nnewline below\n\n" --- "newline below\n\n" -trimAroundDelimiters :: String -> String -trimAroundDelimiters txt = - txt - & ( \s -> - List.breakOn "\n" s - & \case - (prefix, suffix) - | all isSpace prefix -> drop 1 suffix - | otherwise -> prefix <> suffix - ) - & ( \s -> - List.breakOnEnd "\n" s - & \case - (_prefix, "") -> s - (prefix, suffix) - | all isSpace suffix -> dropTrailingNewline prefix - | otherwise -> prefix <> suffix - ) - where - dropTrailingNewline = \case - [] -> [] - (x : xs) -> NonEmpty.init (x NonEmpty.:| xs) - separated :: (Char -> Bool) -> P a -> P a separated ok p = P.try $ p <* P.lookAhead (void (P.satisfy ok) <|> P.eof) -open :: String -> P [Token Lexeme] -open b = do - token <- tokenP $ lit b - env <- S.get - S.put (env {opening = Just b}) - pure [Open b <$ token] - -openKw :: String -> P [Token Lexeme] -openKw s = separated wordySep $ do - token <- tokenP $ lit s - env <- S.get - S.put (env {opening = Just s}) - pure [Open <$> token] - wordySep :: Char -> Bool wordySep c = isSpace c || not (wordyIdChar c) -tok :: P a -> P [Token a] -tok p = do - token <- tokenP p - pure [token] - -- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is -- symboly (comprised of only symbols) or wordy (comprised of only alphanums). -- @@ -1258,23 +201,6 @@ identifierP = do NameSegment.ReservedOperator s -> ReservedSymbolyId (Text.unpack s) NameSegment.ReservedWord s -> ReservedWordyId (Text.unpack s) --- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is --- symboly (comprised of only symbols) or wordy (comprised of only alphanums). --- --- Examples: --- --- foo --- .foo.++.doc --- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") -identifierLexemeP :: P Lexeme -identifierLexemeP = identifierLexeme <$> identifierP - -identifierLexeme :: HQ'.HashQualified Name -> Lexeme -identifierLexeme name = - if Name.isSymboly (HQ'.toName name) - then SymbolyId name - else WordyId name - wordyIdSegP :: P NameSegment wordyIdSegP = PI.withParsecT (fmap (ReservedWordyId . Text.unpack)) NameSegment.wordyP @@ -1283,59 +209,11 @@ shortHashP :: P ShortHash shortHashP = PI.withParsecT (fmap (InvalidShortHash . 
Text.unpack)) ShortHash.shortHashP -blockDelimiter :: [String] -> P String -> P [Token Lexeme] -blockDelimiter open closeP = do - Token close pos1 pos2 <- tokenP closeP - env <- S.get - case findClose open (layout env) of - Nothing -> err pos1 (UnexpectedDelimiter (quote close)) - where - quote s = "'" <> s <> "'" - Just (_, n) -> do - S.put (env {layout = drop (n - 1) (layout env)}) - let delims = [Token (Reserved close) pos1 pos2] - pure $ replicate (n - 1) (Token Close pos1 pos2) ++ delims - -close :: [String] -> P String -> P [Token Lexeme] -close = close' Nothing - -closeKw' :: Maybe String -> [String] -> P String -> P [Token Lexeme] -closeKw' reopenBlockname open closeP = close' reopenBlockname open (separated wordySep closeP) - -close' :: Maybe String -> [String] -> P String -> P [Token Lexeme] -close' reopenBlockname open closeP = do - Token close pos1 pos2 <- tokenP closeP - env <- S.get - case findClose open (layout env) of - Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen (quote close)) - where - msgOpen = List.intercalate " or " (quote <$> open) - quote s = "'" <> s <> "'" - Just (_, n) -> do - S.put (env {layout = drop n (layout env), opening = reopenBlockname}) - let opens = maybe [] (const $ [Token (Open close) pos1 pos2]) reopenBlockname - pure $ replicate n (Token Close pos1 pos2) ++ opens - -findClose :: [String] -> Layout -> Maybe (String, Int) -findClose _ [] = Nothing -findClose s ((h, _) : tl) = if h `elem` s then Just (h, 1) else fmap (1 +) <$> findClose s tl - -notLayout :: Token Lexeme -> Bool -notLayout t = case payload t of - Close -> False - Semi _ -> False - Open _ -> False - _ -> True - -- `True` if the tokens are adjacent, with no space separating the two touches :: Token a -> Token b -> Bool touches (end -> t) (start -> t2) = line t == line t2 && column t == column t2 -top :: Layout -> Column -top [] = 1 -top ((_, h) : _) = h - -- todo: make Layout a NonEmpty topBlockName :: Layout -> Maybe BlockName topBlockName [] = Nothing @@ -1344,122 +222,6 @@ topBlockName ((name, _) : _) = Just name pop :: [a] -> [a] pop = drop 1 -topLeftCorner :: Pos -topLeftCorner = Pos 1 1 - -data BlockTree a - = Block - -- | The token that opens the block - a - -- | “Stanzas” of nested tokens - [[BlockTree a]] - -- | The closing token, if any - (Maybe a) - | Leaf a - deriving (Functor, Foldable, Traversable) - -headToken :: BlockTree a -> a -headToken (Block a _ _) = a -headToken (Leaf a) = a - -instance (Show a) => Show (BlockTree a) where - show (Leaf a) = show a - show (Block open mid close) = - show open - ++ "\n" - ++ indent " " (intercalateMap "\n" (intercalateMap " " show) mid) - ++ "\n" - ++ maybe "" show close - where - indent by s = by ++ (s >>= go by) - go by '\n' = '\n' : by - go _ c = [c] - -reorderTree :: ([[BlockTree a]] -> [[BlockTree a]]) -> BlockTree a -> BlockTree a -reorderTree f (Block open mid close) = Block open (f (fmap (reorderTree f) <$> mid)) close -reorderTree _ l = l - -tree :: [Token Lexeme] -> BlockTree (Token Lexeme) -tree toks = one toks const - where - one (open@(payload -> Open _) : ts) k = many (Block open . 
stanzas) [] ts k - one (t : ts) k = k (Leaf t) ts - one [] k = k lastErr [] - where - lastErr = Leaf case drop (length toks - 1) toks of - [] -> Token (Err LayoutError) topLeftCorner topLeftCorner - (t : _) -> t {payload = Err LayoutError} - - many open acc [] k = k (open (reverse acc) Nothing) [] - many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) $ pure t) ts - many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k - -stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] -stanzas = - toList - . foldr - ( \tok (curr :| stanzas) -> case tok of - Leaf (Token (Semi _) _ _) -> [tok] :| curr : stanzas - _ -> (tok : curr) :| stanzas - ) - ([] :| []) - --- Moves type and ability declarations to the front of the token stream --- and move `use` statements to the front of each block -reorder :: [[BlockTree (Token Lexeme)]] -> [[BlockTree (Token Lexeme)]] -reorder = foldr fixup [] . sortWith f - where - f [] = 3 :: Int - f (t0 : _) = case payload $ headToken t0 of - Open mod | Set.member (Text.pack mod) typeModifiers -> 1 - Open typOrA | Set.member (Text.pack typOrA) typeOrAbility -> 1 - Reserved "use" -> 0 - _ -> 3 :: Int - -- after reordering can end up with trailing semicolon at the end of - -- a block, which we remove with this pass - fixup stanza [] = case Lens.unsnoc stanza of - Nothing -> [] - -- remove any trailing `Semi` from the last non-empty stanza - Just (init, Leaf (Token (Semi _) _ _)) -> [init] - -- don’t touch other stanzas - Just (_, _) -> [stanza] - fixup stanza tail = stanza : tail - --- | This turns the lexeme stream into a tree, reordering some lexeme subsequences. -preParse :: [Token Lexeme] -> BlockTree (Token Lexeme) -preParse = reorderTree reorder . tree - --- | A few transformations that happen between lexing and parsing. --- --- All of these things should move out of the lexer, and be applied in the parse. -postLex :: [Token Lexeme] -> [Token Lexeme] -postLex = toList . preParse . foldr tweak [] - -isDelayOrForce :: Char -> Bool -isDelayOrForce op = op == '\'' || op == '!' - --- Mapping between characters and their escape codes. Use parse/showEscapeChar to convert. -escapeChars :: [(Char, Char)] -escapeChars = - [ ('0', '\0'), - ('a', '\a'), - ('b', '\b'), - ('f', '\f'), - ('n', '\n'), - ('r', '\r'), - ('t', '\t'), - ('v', '\v'), - ('s', ' '), - ('\'', '\''), - ('"', '"'), - ('\\', '\\') - ] - --- Inverse of parseEscapeChar; map a character to its escaped version: -showEscapeChar :: Char -> Maybe Char -showEscapeChar c = - Map.lookup c (Map.fromList [(x, y) | (y, x) <- escapeChars]) - typeOrAbilityAlt :: (Alternative f) => (Text -> f a) -> f a typeOrAbilityAlt f = asum $ map f (toList typeOrAbility) @@ -1471,28 +233,6 @@ typeModifiersAlt f = inc :: Pos -> Pos inc (Pos line col) = Pos line (col + 1) -debugFilePreParse :: FilePath -> IO () -debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file - -debugPreParse :: BlockTree (Token Lexeme) -> String -debugPreParse (Leaf (Token (Err (UnexpectedTokens msg)) start end)) = - (if start == end then msg1 else msg2) <> ":\n" <> msg - where - msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start) - msg2 = - "Error on line " - <> show (line start) - <> ", column " - <> show (column start) - <> " - line " - <> show (line end) - <> ", column " - <> show (column end) -debugPreParse ts = show $ payload <$> ts - -debugPreParse' :: String -> String -debugPreParse' = debugPreParse . preParse . 
lexer "debugPreParse" - instance EP.ShowErrorComponent (Token Err) where showErrorComponent (Token err _ _) = go err where @@ -1504,35 +244,3 @@ instance EP.ShowErrorComponent (Token Err) where TextLiteralMissingClosingQuote s -> "This text literal missing a closing quote: " <> excerpt s e -> show e excerpt s = if length s < 15 then s else take 15 s <> "..." - -instance P.VisualStream [Token Lexeme] where - showTokens _ xs = - join . Nel.toList . S.evalState (traverse go xs) . end $ Nel.head xs - where - go :: Token Lexeme -> S.State Pos String - go tok = do - prev <- S.get - S.put $ end tok - pure $ pad prev (start tok) ++ pretty (payload tok) - pretty (Open s) = s - pretty (Reserved w) = w - pretty (Textual t) = '"' : t ++ ['"'] - pretty (Character c) = - case showEscapeChar c of - Just c -> "?\\" ++ [c] - Nothing -> '?' : [c] - pretty (WordyId n) = Text.unpack (HQ'.toText n) - pretty (SymbolyId n) = Text.unpack (HQ'.toText n) - pretty (Blank s) = "_" ++ s - pretty (Numeric n) = n - pretty (Hash sh) = show sh - pretty (Err e) = show e - pretty (Bytes bs) = "0xs" <> show bs - pretty Close = "" - pretty (Semi True) = "" - pretty (Semi False) = ";" - pretty (Doc d) = show d - pad (Pos line1 col1) (Pos line2 col2) = - if line1 == line2 - then replicate (col2 - col1) ' ' - else replicate (line2 - line1) '\n' ++ replicate col2 ' ' diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs new file mode 100644 index 0000000000..dcaf9ca6d3 --- /dev/null +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -0,0 +1,910 @@ +{-# LANGUAGE TemplateHaskell #-} +{-# OPTIONS_GHC -Wno-orphans #-} + +module Unison.Syntax.Lexer.Unison + ( Token (..), + Line, + Column, + Err (..), + Pos (..), + Lexeme (..), + lexer, + preParse, + escapeChars, + debugFilePreParse, + debugPreParse, + debugPreParse', + showEscapeChar, + touches, + + -- * Character classifiers + wordyIdChar, + wordyIdStartChar, + symbolyIdChar, + + -- * Error formatting + formatTrivialError, + displayLexeme, + ) +where + +import Control.Lens qualified as Lens +import Control.Monad.State qualified as S +import Data.Char (isAlphaNum, isDigit, isSpace, ord, toLower) +import Data.Foldable qualified as Foldable +import Data.List qualified as List +import Data.List.NonEmpty (NonEmpty ((:|))) +import Data.List.NonEmpty qualified as Nel +import Data.Map.Strict qualified as Map +import Data.Set qualified as Set +import Data.Text qualified as Text +import GHC.Exts (sortWith) +import Text.Megaparsec qualified as P +import Text.Megaparsec.Char (char) +import Text.Megaparsec.Char qualified as CP +import Text.Megaparsec.Char.Lexer qualified as LP +import Text.Megaparsec.Error qualified as EP +import Unison.HashQualifiedPrime qualified as HQ' +import Unison.Name (Name) +import Unison.Name qualified as Name +import Unison.NameSegment qualified as NameSegment (docSegment) +import Unison.NameSegment.Internal qualified as NameSegment +import Unison.Prelude +import Unison.ShortHash (ShortHash) +import Unison.ShortHash qualified as SH +import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) +import Unison.Syntax.Lexer +import Unison.Syntax.Lexer.Token (posP, tokenP) +import Unison.Syntax.Name qualified as Name (isSymboly, toText, unsafeParseText) +import Unison.Syntax.Parser.Doc qualified as Doc +import Unison.Syntax.Parser.Doc.Data qualified as Doc +import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) +import Unison.Util.Bytes qualified as Bytes +import Unison.Util.Monoid 
(intercalateMap) + +-- Design principle: +-- `[Lexeme]` should be sufficient information for parsing without +-- further knowledge of spacing or indentation levels +-- any knowledge of comments +data Lexeme + = Open String -- start of a block + | Semi IsVirtual -- separator between elements of a block + | Close -- end of a block + | Reserved String -- reserved tokens such as `{`, `(`, `type`, `of`, etc + | Textual String -- text literals, `"foo bar"` + | Character Char -- character literals, `?X` + | WordyId (HQ'.HashQualified Name) -- a (non-infix) identifier. invariant: last segment is wordy + | SymbolyId (HQ'.HashQualified Name) -- an infix identifier. invariant: last segment is symboly + | Blank String -- a typed hole or placeholder + | Numeric String -- numeric literals, left unparsed + | Bytes Bytes.Bytes -- bytes literals + | Hash ShortHash -- hash literals + | Err Err + | Doc (Doc.UntitledSection (Doc.Tree [Token Lexeme])) + deriving stock (Eq, Show, Ord) + +type IsVirtual = Bool -- is it a virtual semi or an actual semi? + +token :: P Lexeme -> P [Token Lexeme] +token = token' (\a start end -> [Token a start end]) + +-- Token parser: strips trailing whitespace and comments after a +-- successful parse, and also takes care of emitting layout tokens +-- (such as virtual semicolons and closing tokens). +token' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] +token' tok p = LP.lexeme space (token'' tok p) + +-- Token parser implementation which leaves trailing whitespace and comments +-- but does emit layout tokens such as virtual semicolons and closing tokens. +token'' :: (a -> Pos -> Pos -> [Token Lexeme]) -> P a -> P [Token Lexeme] +token'' tok p = do + start <- posP + -- We save the current state so we can backtrack the state if `p` fails. + env <- S.get + layoutToks <- case opening env of + -- If we're opening a block named b, we push (b, currentColumn) onto + -- the layout stack. Example: + -- + -- blah = cases + -- {- A comment -} + -- -- A one-line comment + -- 0 -> "hi" + -- 1 -> "bye" + -- + -- After the `cases` token, the state will be opening = Just "cases", + -- meaning the parser is searching for the next non-whitespace/comment + -- character to determine the leftmost column of the `cases` block. + -- That will be the column of the `0`. + Just blockname -> + -- special case - handling of empty blocks, as in: + -- foo = + -- bar = 42 + if blockname == "=" && column start <= top l && not (null l) + then do + S.put (env {layout = (blockname, column start + 1) : l, opening = Nothing}) + pops start + else [] <$ S.put (env {layout = layout', opening = Nothing}) + where + layout' = (blockname, column start) : l + l = layout env + -- If we're not opening a block, we potentially pop from + -- the layout stack and/or emit virtual semicolons. 
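+    -- For example (roughly), in
+    --   foo = bar
+    --   baz = qux
+    -- `baz` starts at the same column as `foo`, so a virtual semicolon is
+    -- emitted between the two bindings.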
+ Nothing -> if inLayout env then pops start else pure [] + beforeTokenPos <- posP + a <- p <|> (S.put env >> fail "resetting state") + endPos <- posP + pure $ layoutToks ++ tok a beforeTokenPos endPos + where + pops :: Pos -> P [Token Lexeme] + pops p = do + env <- S.get + let l = layout env + if top l == column p && topContainsVirtualSemis l + then pure [Token (Semi True) p p] + else + if column p > top l || topHasClosePair l + then pure [] + else + if column p < top l + then S.put (env {layout = pop l}) >> ((Token Close p p :) <$> pops p) + else error "impossible" + + -- don't emit virtual semis in (, {, or [ blocks + topContainsVirtualSemis :: Layout -> Bool + topContainsVirtualSemis = \case + [] -> False + ((name, _) : _) -> name /= "(" && name /= "{" && name /= "[" + + topHasClosePair :: Layout -> Bool + topHasClosePair [] = False + topHasClosePair ((name, _) : _) = + name `elem` ["DUMMY", "{", "(", "[", "handle", "match", "if", "then"] + +showErrorFancy :: (P.ShowErrorComponent e) => P.ErrorFancy e -> String +showErrorFancy = \case + P.ErrorFail msg -> msg + P.ErrorIndentation ord ref actual -> + "incorrect indentation (got " + <> show (P.unPos actual) + <> ", should be " + <> p + <> show (P.unPos ref) + <> ")" + where + p = case ord of + LT -> "less than " + EQ -> "equal to " + GT -> "greater than " + P.ErrorCustom a -> P.showErrorComponent a + +lexer :: String -> String -> [Token Lexeme] +lexer scope rem = + case flip S.evalState env0 $ P.runParserT (lexemes eof) scope rem of + Left e -> + let errsWithSourcePos = + fst $ + P.attachSourcePos + P.errorOffset + (toList (P.bundleErrors e)) + (P.bundlePosState e) + errorToTokens :: (EP.ParseError String (Token Err), P.SourcePos) -> [Token Lexeme] + errorToTokens (err, top) = case err of + P.FancyError _ (customErrs -> es) | not (null es) -> es + P.FancyError _errOffset es -> + let msg = intercalateMap "\n" showErrorFancy es + in [Token (Err (UnexpectedTokens msg)) (toPos top) (toPos top)] + P.TrivialError _errOffset mayUnexpectedTokens expectedTokens -> + let unexpectedStr :: Set String + unexpectedStr = + mayUnexpectedTokens + & fmap errorItemToString + & maybeToList + & Set.fromList + errorLength :: Int + errorLength = case Set.toList unexpectedStr of + [] -> 0 + (x : _) -> length x + expectedStr :: Set String + expectedStr = + expectedTokens + & Set.map errorItemToString + err = UnexpectedTokens $ formatTrivialError unexpectedStr expectedStr + startPos = toPos top + -- This is just an attempt to highlight errors better in source excerpts. + -- It may not work in all cases, but should generally provide a better experience. 
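+              -- For example (illustrative, assuming an unexpected token "foo"
+              -- at line 3, column 7): errorLength is 3, so the reported span
+              -- becomes 3:7-3:10 rather than the zero-width span 3:7-3:7.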
+              endPos = startPos & \(Pos l c) -> Pos l (c + errorLength)
+           in [Token (Err err) startPos endPos]
+    in errsWithSourcePos >>= errorToTokens
+  Right ts -> postLex $ Token (Open scope) topLeftCorner topLeftCorner : ts
+  where
+    eof :: P [Token Lexeme]
+    eof = P.try do
+      p <- P.eof >> posP
+      n <- maybe 0 (const 1) <$> S.gets opening
+      l <- S.gets layout
+      pure $ replicate (length l + n) (Token Close p p)
+    errorItemToString :: EP.ErrorItem Char -> String
+    errorItemToString = \case
+      (P.Tokens ts) -> Foldable.toList ts
+      (P.Label ts) -> Foldable.toList ts
+      (P.EndOfInput) -> "end of input"
+    customErrs es = [Err <$> e | P.ErrorCustom e <- toList es]
+    toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col)
+    env0 = ParsingEnv [] (Just scope) True [0] 0
+
+-- | A hacky postprocessing pass to do some cleanup of stuff that's annoying to
+--   fix without adding more state to the lexer:
+--   - 1+1 lexes as [1, +1]; convert this to [1, +, 1]
+--   - when a semi is followed by a virtual semi, drop the virtual, letting you
+--     write
+--       foo x = action1;
+--       2
+--   - a semi immediately after the first Open is ignored
+tweak :: (Token Lexeme) -> [Token Lexeme] -> [Token Lexeme]
+tweak h@(Token (Semi False) _ _) (Token (Semi True) _ _ : t) = h : t
+-- __NB__: This case only exists to guard against the following one
+tweak h@(Token (Reserved _) _ _) t = h : t
+tweak t1 (t2@(Token (Numeric num) _ _) : rem)
+  | notLayout t1 && touches t1 t2 && isSigned num =
+      t1
+        : Token
+          (SymbolyId (HQ'.fromName (Name.unsafeParseText (Text.pack (take 1 num)))))
+          (start t2)
+          (inc $ start t2)
+        : Token (Numeric (drop 1 num)) (inc $ start t2) (end t2)
+        : rem
+  where
+    isSigned num = all (\ch -> ch == '-' || ch == '+') $ take 1 num
+tweak h t = h : t
+
+formatTrivialError :: Set String -> Set String -> [Char]
+formatTrivialError unexpectedTokens expectedTokens =
+  let unexpectedMsg = case Set.toList unexpectedTokens of
+        [] -> "I found something I didn't expect."
+        [x] ->
+          let article = case x of
+                (c : _) | c `elem` ("aeiou" :: String) -> "an"
+                _ -> "a"
+           in "I was surprised to find " <> article <> " " <> x <> " here."
+        xs -> "I was surprised to find these:\n\n* " <> List.intercalate "\n* " xs
+      expectedMsg = case Set.toList expectedTokens of
+        [] -> Nothing
+        xs -> Just $ "\nI was expecting one of these instead:\n\n* " <> List.intercalate "\n* " xs
+   in concat $ catMaybes [Just unexpectedMsg, expectedMsg]
+
+displayLexeme :: Lexeme -> String
+displayLexeme = \case
+  Open o -> o
+  Semi True -> "end of stanza"
+  Semi False -> "semicolon"
+  Close -> "end of section"
+  Reserved r -> "'" <> r <> "'"
+  Textual t -> "\"" <> t <> "\""
+  Character c -> "?" <> [c]
+  WordyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq)
+  SymbolyId hq -> Text.unpack (HQ'.toTextWith Name.toText hq)
+  Blank b -> b
+  Numeric n -> n
+  Bytes _b -> "bytes literal"
+  Hash h -> Text.unpack (SH.toText h)
+  Err e -> show e
+  Doc _ -> "doc structure"
+
+-- | The `Doc` lexer as documented on unison-lang.org
+doc2 :: P [Token Lexeme]
+doc2 = do
+  -- Ensure we're at a doc before we start consuming tokens
+  P.lookAhead (lit "{{")
+  openStart <- posP
+  -- Produce any layout tokens, such as closing the last open block or virtual semicolons
+  -- We don't use 'token' on "{{" directly because we don't want to duplicate layout
+  -- tokens if we do the rewrite hack for type-docs below.
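+  -- (Illustrative note, inferred from the code below: 'beforeStartToks' is
+  -- spliced back in front of whichever token sequence we end up producing,
+  -- so the layout tokens are emitted exactly once whether or not the
+  -- type-doc rewrite fires.)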
+  beforeStartToks <- token' ignore (pure ())
+  void $ lit "{{"
+  openEnd <- posP
+  CP.space
+  env0 <- S.get
+  -- Disable layout while parsing the doc block and reset the section number
+  (docTok, closeTok) <- local
+    ( \env ->
+        env
+          { inLayout = False,
+            parentSections = 0 : (parentSections env0)
+          }
+    )
+    do
+      body <- Doc.untitledSection lexemes' . P.lookAhead $ () <$ lit "}}"
+      closeStart <- posP
+      lit "}}"
+      closeEnd <- posP
+      pure (Token (Doc body) openStart closeEnd, Token Close closeStart closeEnd)
+  -- Parse any layout tokens after the doc block, e.g. virtual semicolon
+  endToks <- token' ignore (pure ())
+  -- Hack to allow anonymous doc blocks before type decls
+  -- {{ Some docs }}          Foo.doc = {{ Some docs }}
+  -- ability Foo where   =>   ability Foo where
+  --
+  -- __FIXME__: This should be done _after_ parsing, not in lexing.
+  tn <- subsequentTypeName
+  pure $
+    beforeStartToks <> case (tn) of
+      -- If we're followed by a type, we rewrite the doc block to be a named doc block.
+      Just (WordyId tname)
+        | isTopLevel ->
+            Token (WordyId (HQ'.fromName (Name.snoc (HQ'.toName tname) NameSegment.docSegment))) openStart openEnd
+              : Token (Open "=") openStart openEnd
+              : docTok
+              -- We need an extra 'Close' here because we added an extra Open above.
+              : closeTok
+              : endToks
+        where
+          isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1
+      _ -> docTok : endToks
+  where
+    -- DUPLICATED
+    wordyKw kw = separated wordySep (lit kw)
+    subsequentTypeName = P.lookAhead . P.optional $ do
+      let lit' s = lit s <* sp
+      let modifier = typeModifiersAlt (lit' . Text.unpack)
+      let typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack)
+      _ <- optional modifier *> typeOrAbility' *> sp
+      Token name start stop <- tokenP identifierP
+      if Name.isSymboly (HQ'.toName name)
+        then P.customFailure (Token (InvalidSymbolyId (Text.unpack (HQ'.toTextWith Name.toText name))) start stop)
+        else pure (WordyId name)
+    ignore _ _ _ = []
+    -- DUPLICATED
+    sp = P.try $ do
+      spaces <- P.takeWhile1P (Just "space") isSpace
+      close <- P.optional (P.lookAhead (lit "}}"))
+      case close of
+        Nothing -> guard $ ok spaces
+        Just _ -> pure ()
+      pure spaces
+      where
+        ok s = length [() | '\n' <- s] < 2
+
+lexemes' :: P () -> P [Token Lexeme]
+lexemes' eof =
+  -- NB: `postLex` requires the token stream to start with an `Open`, otherwise it can’t create a `T`, so this adds one,
+  -- runs `postLex`, then removes it.
+  fmap (tail . postLex . (Token (Open "fake") mempty mempty :)) $
+    local (\env -> env {inLayout = True, opening = Just "DUMMY"}) do
+      p <- lexemes $ [] <$ eof
+      -- deals with a final "unclosed" block at the end of `p`
+      unclosed <- takeWhile (("DUMMY" /=) . fst) . layout <$> S.get
+      let pos = end $ last p
+      pure $ p <> replicate (length unclosed) (Token Close pos pos)
+
+-- | Consumes an entire Unison “module”.
+lexemes :: P [Token Lexeme] -> P [Token Lexeme]
+lexemes eof =
+  P.optional space >> do
+    hd <- join <$> P.manyTill toks (P.lookAhead eof)
+    tl <- eof
+    pure $ hd <> tl
+  where
+    toks :: P [Token Lexeme]
+    toks =
+      doc2
+        <|> doc
+        <|> token numeric
+        <|> token character
+        <|> reserved
+        <|> token blank
+        <|> token identifierLexemeP
+        <|> (asum . map token) [semi, textual, hash]
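+    -- (Illustrative note, an observation rather than original commentary:
+    -- the order of alternatives matters here, e.g. `numeric` is tried before
+    -- `identifierLexemeP` so that "+1" lexes as a signed numeric literal
+    -- rather than as the operator `+` applied to `1`.)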
+
+    doc :: P [Token Lexeme]
+    doc = open <+> (CP.space *> fmap fixup body) <+> (close <* space)
+      where
+        open = token'' (\t _ _ -> t) $ tok (Open <$> lit "[:")
+        close = tok (Close <$ lit ":]")
+        at = lit "@"
+        -- this removes some trailing whitespace from the final textual segment
+        fixup [] = []
+        fixup (Token (Textual (reverse -> txt)) start stop : []) =
+          [Token (Textual txt') start stop]
+          where
+            txt' = reverse (dropWhile (\c -> isSpace c && not (c == '\n')) txt)
+        fixup (h : t) = h : fixup t
+
+        body :: P [Token Lexeme]
+        body = txt <+> (atk <|> pure [])
+          where
+            ch = (":]" <$ lit "\\:]") <|> ("@" <$ lit "\\@") <|> (pure <$> P.anySingle)
+            txt = tok (Textual . join <$> P.manyTill ch (P.lookAhead sep))
+            sep = void at <|> void close
+            ref = at *> (tok identifierLexemeP <|> docTyp)
+            atk = (ref <|> docTyp) <+> body
+            docTyp = do
+              _ <- lit "["
+              typ <- tok (P.manyTill P.anySingle (P.lookAhead (lit "]")))
+              _ <- lit "]" *> CP.space
+              t <- tok identifierLexemeP
+              pure $ (fmap Reserved <$> typ) <> t
+
+    blank =
+      separated wordySep do
+        _ <- char '_'
+        seg <- P.optional wordyIdSegP
+        pure (Blank (maybe "" (Text.unpack . NameSegment.toUnescapedText) seg))
+
+    semi = char ';' $> Semi False
+    textual = Textual <$> quoted
+    quoted = quotedRaw <|> quotedSingleLine
+    quotedRaw = do
+      _ <- lit "\"\"\""
+      n <- many (char '"')
+      _ <- optional (char '\n') -- initial newline is skipped
+      s <- P.manyTill P.anySingle (lit (replicate (length n + 3) '"'))
+      col0 <- column <$> posP
+      let col = col0 - (length n) - 3 -- this gets us the first col of the closing quotes
+      let leading = replicate (max 0 (col - 1)) ' '
+      -- a last line that's equal to `leading` is ignored, since leading
+      -- spaces up to `col` are not considered part of the string
+      let tweak l = case reverse l of
+            last : rest
+              | col > 1 && last == leading -> reverse rest
+              | otherwise -> l
+            [] -> []
+      pure $ case tweak (lines s) of
+        [] -> s
+        ls
+          | all (\l -> List.isPrefixOf leading l || all isSpace l) ls -> List.intercalate "\n" (drop (length leading) <$> ls)
+          | otherwise -> s
+    quotedSingleLine = char '"' *> P.manyTill (LP.charLiteral <|> sp) (char '"')
+      where
+        sp = lit "\\s" $> ' '
+    character = Character <$> (char '?' *> (spEsc <|> LP.charLiteral))
+      where
+        spEsc = P.try (char '\\' *> char 's' $> ' ')
+
+    numeric = bytes <|> otherbase <|> float <|> intOrNat
+      where
+        intOrNat = P.try $ num <$> sign <*> LP.decimal
+        float = do
+          _ <- P.try (P.lookAhead (sign >> (LP.decimal :: P Int) >> (char '.' <|> char 'e' <|> char 'E'))) -- commit after this
+          start <- posP
+          sign <- fromMaybe "" <$> sign
+          base <- P.takeWhile1P (Just "base") isDigit
+          decimals <-
+            P.optional $
+              let missingFractional = err start (MissingFractional $ base <> ".")
+               in liftA2 (<>) (lit ".") (P.takeWhile1P (Just "decimals") isDigit <|> missingFractional)
+          exp <- P.optional $ do
+            e <- map toLower <$> (lit "e" <|> lit "E")
+            sign <- fromMaybe "" <$> optional (lit "+" <|> lit "-")
+            let missingExp = err start (MissingExponent $ base <> fromMaybe "" decimals <> e <> sign)
+            exp <- P.takeWhile1P (Just "exponent") isDigit <|> missingExp
+            pure $ e <> sign <> exp
+          pure $ Numeric (sign <> base <> fromMaybe "" decimals <> fromMaybe "" exp)
+
+        bytes = do
+          start <- posP
+          _ <- lit "0xs"
+          s <- map toLower <$> P.takeWhileP (Just "hexadecimal character") isAlphaNum
+          case Bytes.fromBase16 $ Bytes.fromWord8s (fromIntegral .
ord <$> s) of + Left _ -> err start (InvalidBytesLiteral $ "0xs" <> s) + Right bs -> pure (Bytes bs) + otherbase = octal <|> hex + octal = do + start <- posP + commitAfter2 sign (lit "0o") $ \sign _ -> + fmap (num sign) LP.octal <|> err start InvalidOctalLiteral + hex = do + start <- posP + commitAfter2 sign (lit "0x") $ \sign _ -> + fmap (num sign) LP.hexadecimal <|> err start InvalidHexLiteral + + num :: Maybe String -> Integer -> Lexeme + num sign n = Numeric (fromMaybe "" sign <> show n) + sign = P.optional (lit "+" <|> lit "-") + + hash = Hash <$> P.try shortHashP + + reserved :: P [Token Lexeme] + reserved = + token' (\ts _ _ -> ts) $ + braces + <|> parens + <|> brackets + <|> commaSeparator + <|> delim + <|> delayOrForce + <|> keywords + <|> layoutKeywords + where + keywords = + -- yes "wordy" - just like a wordy keyword like "true", the literal "." (as in the dot in + -- "forall a. a -> a") is considered the keyword "." so long as it is either followed by EOF, a space, or some + -- non-wordy character (because ".foo" is a single identifier lexeme) + wordyKw "." + <|> symbolyKw ":" + <|> openKw "@rewrite" + <|> symbolyKw "@" + <|> symbolyKw "||" + <|> symbolyKw "|" + <|> symbolyKw "&&" + <|> wordyKw "true" + <|> wordyKw "false" + <|> wordyKw "use" + <|> wordyKw "forall" + <|> wordyKw "∀" + <|> wordyKw "termLink" + <|> wordyKw "typeLink" + + wordyKw s = separated wordySep (kw s) + symbolyKw s = separated (not . symbolyIdChar) (kw s) + + kw :: String -> P [Token Lexeme] + kw s = tokenP (lit s) <&> \token -> [Reserved <$> token] + + layoutKeywords :: P [Token Lexeme] + layoutKeywords = + ifElse + <|> withKw + <|> openKw "match" + <|> openKw "handle" + <|> typ + <|> arr + <|> rewriteArr + <|> eq + <|> openKw "cases" + <|> openKw "where" + <|> openKw "let" + <|> openKw "do" + where + ifElse = + openKw "if" + <|> closeKw' (Just "then") ["if"] (lit "then") + <|> closeKw' (Just "else") ["then"] (lit "else") + modKw = typeModifiersAlt (openKw1 wordySep . Text.unpack) + typeOrAbilityKw = typeOrAbilityAlt (openTypeKw1 . Text.unpack) + typ = modKw <|> typeOrAbilityKw + + withKw = do + [Token _ pos1 pos2] <- wordyKw "with" + env <- S.get + let l = layout env + case findClose ["handle", "match"] l of + Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen "'with'") + where + msgOpen = "'handle' or 'match'" + Just (withBlock, n) -> do + let b = withBlock <> "-with" + S.put (env {layout = drop n l, opening = Just b}) + let opens = [Token (Open "with") pos1 pos2] + pure $ replicate n (Token Close pos1 pos2) ++ opens + + -- In `structural/unique type` and `structural/unique ability`, + -- only the `structural` or `unique` opens a layout block, + -- and `ability` and `type` are just keywords. + openTypeKw1 t = do + b <- S.gets (topBlockName . 
layout) + case b of + Just mod | Set.member (Text.pack mod) typeModifiers -> wordyKw t + _ -> openKw1 wordySep t + + -- layout keyword which bumps the layout column by 1, rather than looking ahead + -- to the next token to determine the layout column + openKw1 :: (Char -> Bool) -> String -> P [Token Lexeme] + openKw1 sep kw = do + Token kw pos0 pos1 <- tokenP $ separated sep (lit kw) + S.modify (\env -> env {layout = (kw, column $ inc pos0) : layout env}) + pure [Token (Open kw) pos0 pos1] + + eq = do + [Token _ start end] <- symbolyKw "=" + env <- S.get + case topBlockName (layout env) of + -- '=' does not open a layout block if within a type declaration + Just t | t == "type" || Set.member (Text.pack t) typeModifiers -> pure [Token (Reserved "=") start end] + Just _ -> S.put (env {opening = Just "="}) >> pure [Token (Open "=") start end] + _ -> err start LayoutError + + rewriteArr = do + [Token _ start end] <- symbolyKw "==>" + env <- S.get + S.put (env {opening = Just "==>"}) >> pure [Token (Open "==>") start end] + + arr = do + [Token _ start end] <- symbolyKw "->" + env <- S.get + -- -> introduces a layout block if we're inside a `match with` or `cases` + case topBlockName (layout env) of + Just match | match `elem` matchWithBlocks -> do + S.put (env {opening = Just "->"}) + pure [Token (Open "->") start end] + _ -> pure [Token (Reserved "->") start end] + + -- a bit of lookahead here to reserve }} for closing a documentation block + braces = open "{" <|> close ["{"] p + where + p = do + l <- lit "}" + -- if we're within an existing {{ }} block, inLayout will be false + -- so we can actually allow }} to appear in normal code + inLayout <- S.gets inLayout + when (not inLayout) $ void $ P.lookAhead (P.satisfy (/= '}')) + pure l + matchWithBlocks = ["match-with", "cases"] + parens = open "(" <|> close ["("] (lit ")") + brackets = open "[" <|> close ["["] (lit "]") + -- `allowCommaToClose` determines if a comma should close inner blocks. + -- Currently there is a set of blocks where `,` is not treated specially + -- and it just emits a Reserved ",". There are currently only three: + -- `cases`, `match-with`, and `{` + allowCommaToClose match = not $ match `elem` ("{" : matchWithBlocks) + commaSeparator = do + env <- S.get + case topBlockName (layout env) of + Just match + | allowCommaToClose match -> + blockDelimiter ["[", "("] (lit ",") + _ -> fail "this comma is a pattern separator" + + delim = P.try $ do + ch <- P.satisfy (\ch -> ch /= ';' && Set.member ch delimiters) + pos <- posP + pure [Token (Reserved [ch]) pos (inc pos)] + + delayOrForce = separated ok $ do + token <- tokenP $ P.satisfy isDelayOrForce + pure [token <&> \op -> Reserved [op]] + where + ok c = isDelayOrForce c || isSpace c || isAlphaNum c || Set.member c delimiters || c == '\"' + +open :: String -> P [Token Lexeme] +open b = do + token <- tokenP $ lit b + env <- S.get + S.put (env {opening = Just b}) + pure [Open b <$ token] + +openKw :: String -> P [Token Lexeme] +openKw s = separated wordySep $ do + token <- tokenP $ lit s + env <- S.get + S.put (env {opening = Just s}) + pure [Open <$> token] + +tok :: P a -> P [Token a] +tok p = do + token <- tokenP p + pure [token] + +-- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is +-- symboly (comprised of only symbols) or wordy (comprised of only alphanums). +-- +-- Examples: +-- +-- foo +-- .foo.++.doc +-- `.`.`..` (This is a two-segment identifier without a leading dot: "." 
then "..") +identifierLexemeP :: P Lexeme +identifierLexemeP = identifierLexeme <$> identifierP + +identifierLexeme :: HQ'.HashQualified Name -> Lexeme +identifierLexeme name = + if Name.isSymboly (HQ'.toName name) + then SymbolyId name + else WordyId name + +blockDelimiter :: [String] -> P String -> P [Token Lexeme] +blockDelimiter open closeP = do + Token close pos1 pos2 <- tokenP closeP + env <- S.get + case findClose open (layout env) of + Nothing -> err pos1 (UnexpectedDelimiter (quote close)) + where + quote s = "'" <> s <> "'" + Just (_, n) -> do + S.put (env {layout = drop (n - 1) (layout env)}) + let delims = [Token (Reserved close) pos1 pos2] + pure $ replicate (n - 1) (Token Close pos1 pos2) ++ delims + +close :: [String] -> P String -> P [Token Lexeme] +close = close' Nothing + +closeKw' :: Maybe String -> [String] -> P String -> P [Token Lexeme] +closeKw' reopenBlockname open closeP = close' reopenBlockname open (separated wordySep closeP) + +close' :: Maybe String -> [String] -> P String -> P [Token Lexeme] +close' reopenBlockname open closeP = do + Token close pos1 pos2 <- tokenP closeP + env <- S.get + case findClose open (layout env) of + Nothing -> err pos1 (CloseWithoutMatchingOpen msgOpen (quote close)) + where + msgOpen = List.intercalate " or " (quote <$> open) + quote s = "'" <> s <> "'" + Just (_, n) -> do + S.put (env {layout = drop n (layout env), opening = reopenBlockname}) + let opens = maybe [] (const $ [Token (Open close) pos1 pos2]) reopenBlockname + pure $ replicate n (Token Close pos1 pos2) ++ opens + +findClose :: [String] -> Layout -> Maybe (String, Int) +findClose _ [] = Nothing +findClose s ((h, _) : tl) = if h `elem` s then Just (h, 1) else fmap (1 +) <$> findClose s tl + +notLayout :: Token Lexeme -> Bool +notLayout t = case payload t of + Close -> False + Semi _ -> False + Open _ -> False + _ -> True + +top :: Layout -> Column +top [] = 1 +top ((_, h) : _) = h + +topLeftCorner :: Pos +topLeftCorner = Pos 1 1 + +data BlockTree a + = Block + -- | The token that opens the block + a + -- | “Stanzas” of nested tokens + [[BlockTree a]] + -- | The closing token, if any + (Maybe a) + | Leaf a + deriving (Functor, Foldable, Traversable) + +headToken :: BlockTree a -> a +headToken (Block a _ _) = a +headToken (Leaf a) = a + +instance (Show a) => Show (BlockTree a) where + show (Leaf a) = show a + show (Block open mid close) = + show open + ++ "\n" + ++ indent " " (intercalateMap "\n" (intercalateMap " " show) mid) + ++ "\n" + ++ maybe "" show close + where + indent by s = by ++ (s >>= go by) + go by '\n' = '\n' : by + go _ c = [c] + +reorderTree :: ([[BlockTree a]] -> [[BlockTree a]]) -> BlockTree a -> BlockTree a +reorderTree f (Block open mid close) = Block open (f (fmap (reorderTree f) <$> mid)) close +reorderTree _ l = l + +tree :: [Token Lexeme] -> BlockTree (Token Lexeme) +tree toks = one toks const + where + one (open@(payload -> Open _) : ts) k = many (Block open . stanzas) [] ts k + one (t : ts) k = k (Leaf t) ts + one [] k = k lastErr [] + where + lastErr = Leaf case drop (length toks - 1) toks of + [] -> Token (Err LayoutError) topLeftCorner topLeftCorner + (t : _) -> t {payload = Err LayoutError} + + many open acc [] k = k (open (reverse acc) Nothing) [] + many open acc (t@(payload -> Close) : ts) k = k (open (reverse acc) $ pure t) ts + many open acc ts k = one ts $ \t ts -> many open (t : acc) ts k + +stanzas :: [BlockTree (Token Lexeme)] -> [[BlockTree (Token Lexeme)]] +stanzas = + toList + . 
foldr
+      ( \tok (curr :| stanzas) -> case tok of
+          Leaf (Token (Semi _) _ _) -> [tok] :| curr : stanzas
+          _ -> (tok : curr) :| stanzas
+      )
+      ([] :| [])
+
+-- Moves type and ability declarations to the front of the token stream
+-- and moves `use` statements to the front of each block
+reorder :: [[BlockTree (Token Lexeme)]] -> [[BlockTree (Token Lexeme)]]
+reorder = foldr fixup [] . sortWith f
+  where
+    f [] = 3 :: Int
+    f (t0 : _) = case payload $ headToken t0 of
+      Open mod | Set.member (Text.pack mod) typeModifiers -> 1
+      Open typOrA | Set.member (Text.pack typOrA) typeOrAbility -> 1
+      Reserved "use" -> 0
+      _ -> 3 :: Int
+    -- after reordering we can end up with a trailing semicolon at the end of
+    -- a block, which we remove with this pass
+    fixup stanza [] = case Lens.unsnoc stanza of
+      Nothing -> []
+      -- remove any trailing `Semi` from the last non-empty stanza
+      Just (init, Leaf (Token (Semi _) _ _)) -> [init]
+      -- don’t touch other stanzas
+      Just (_, _) -> [stanza]
+    fixup stanza tail = stanza : tail
+
+-- | This turns the lexeme stream into a tree, reordering some lexeme subsequences.
+preParse :: [Token Lexeme] -> BlockTree (Token Lexeme)
+preParse = reorderTree reorder . tree
+
+-- | A few transformations that happen between lexing and parsing.
+--
+-- All of these things should move out of the lexer, and be applied in the parse.
+postLex :: [Token Lexeme] -> [Token Lexeme]
+postLex = toList . preParse . foldr tweak []
+
+isDelayOrForce :: Char -> Bool
+isDelayOrForce op = op == '\'' || op == '!'
+
+-- Mapping between characters and their escape codes. Use parse/showEscapeChar to convert.
+escapeChars :: [(Char, Char)]
+escapeChars =
+  [ ('0', '\0'),
+    ('a', '\a'),
+    ('b', '\b'),
+    ('f', '\f'),
+    ('n', '\n'),
+    ('r', '\r'),
+    ('t', '\t'),
+    ('v', '\v'),
+    ('s', ' '),
+    ('\'', '\''),
+    ('"', '"'),
+    ('\\', '\\')
+  ]
+
+-- Inverse of parseEscapeChar; map a character to its escaped version:
+showEscapeChar :: Char -> Maybe Char
+showEscapeChar c =
+  Map.lookup c (Map.fromList [(x, y) | (y, x) <- escapeChars])
+
+debugFilePreParse :: FilePath -> IO ()
+debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file
+
+debugPreParse :: BlockTree (Token Lexeme) -> String
+debugPreParse (Leaf (Token (Err (UnexpectedTokens msg)) start end)) =
+  (if start == end then msg1 else msg2) <> ":\n" <> msg
+  where
+    msg1 = "Error on line " <> show (line start) <> ", column " <> show (column start)
+    msg2 =
+      "Error on line "
+        <> show (line start)
+        <> ", column "
+        <> show (column start)
+        <> " - line "
+        <> show (line end)
+        <> ", column "
+        <> show (column end)
+debugPreParse ts = show $ payload <$> ts
+
+debugPreParse' :: String -> String
+debugPreParse' = debugPreParse . preParse . lexer "debugPreParse"
+
+instance P.VisualStream [Token Lexeme] where
+  showTokens _ xs =
+    join . Nel.toList . S.evalState (traverse go xs) . end $ Nel.head xs
+    where
+      go :: Token Lexeme -> S.State Pos String
+      go tok = do
+        prev <- S.get
+        S.put $ end tok
+        pure $ pad prev (start tok) ++ pretty (payload tok)
+      pretty (Open s) = s
+      pretty (Reserved w) = w
+      pretty (Textual t) = '"' : t ++ ['"']
+      pretty (Character c) =
+        case showEscapeChar c of
+          Just c -> "?\\" ++ [c]
+          Nothing -> '?'
: [c] + pretty (WordyId n) = Text.unpack (HQ'.toText n) + pretty (SymbolyId n) = Text.unpack (HQ'.toText n) + pretty (Blank s) = "_" ++ s + pretty (Numeric n) = n + pretty (Hash sh) = show sh + pretty (Err e) = show e + pretty (Bytes bs) = "0xs" <> show bs + pretty Close = "" + pretty (Semi True) = "" + pretty (Semi False) = ";" + pretty (Doc d) = show d + pad (Pos line1 col1) (Pos line2 col2) = + if line1 == line2 + then replicate (col2 - col1) ' ' + else replicate (line2 - line1) '\n' ++ replicate col2 ' ' diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index 1bee4d08f4..098caab1b6 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -88,8 +88,9 @@ import Unison.Pattern qualified as Pattern import Unison.Prelude import Unison.Reference (Reference) import Unison.Referent (Referent) -import Unison.Syntax.Lexer qualified as L +import Unison.Syntax.Lexer.Unison qualified as L import Unison.Syntax.Name qualified as Name (toVar, unsafeParseText) +import Unison.Syntax.Parser.Doc qualified as Doc import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Term (MatchCase (..)) import Unison.UnisonFile.Error qualified as UF @@ -400,7 +401,7 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing -doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection L.DocTree)) +doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree [L.Token L.Lexeme]))) doc = queryToken \case L.Doc d -> pure d _ -> Nothing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs new file mode 100644 index 0000000000..5ca747f204 --- /dev/null +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -0,0 +1,476 @@ +module Unison.Syntax.Parser.Doc where + +import Control.Comonad.Cofree (Cofree ((:<))) +import Control.Monad.State qualified as S +import Data.Char (isControl, isSpace) +import Data.List qualified as List +import Data.List.Extra qualified as List +import Data.List.NonEmpty (NonEmpty ((:|))) +import Data.List.NonEmpty qualified as NonEmpty +import Data.Text qualified as Text +import Text.Megaparsec qualified as P +import Text.Megaparsec.Char (char) +import Text.Megaparsec.Char qualified as CP +import Text.Megaparsec.Char.Lexer qualified as LP +import Unison.Parser.Ann (Ann, Annotated (..)) +import Unison.Prelude +import Unison.Syntax.Lexer + ( P, + ParsingEnv (..), + column, + identifierP, + line, + lit, + local, + sepBy1', + separated, + some', + someTill', + typeOrAbilityAlt, + wordySep, + (<+>), + ) +import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) +import Unison.Syntax.Parser.Doc.Data + +type Tree code = Cofree (Top code) Ann + +-- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that +-- Unison wraps `Doc` literals in `}}`). +untitledSection :: forall code. (Annotated code) => (P () -> P code) -> P () -> P (UntitledSection (Tree code)) +untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.space) + where + wordyKw kw = separated wordySep (lit kw) + sectionElem = section <|> fencedBlock <|> list <|> paragraph + paragraph = wrap' . Paragraph <$> spaced leaf + reserved word = List.isPrefixOf "}}" word || all (== '#') word + + wordy :: P end -> P (Leaf code void) + wordy closing = fmap Word . tokenP . 
P.try $ do + let end = + P.lookAhead $ + docClose + <|> void (P.satisfy isSpace) + <|> void closing + word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end + guard (not $ reserved word || null word) + pure word + + leafy closing = groupy closing gs + where + gs = + link + <|> externalLink + <|> exampleInline + <|> expr + <|> boldOrItalicOrStrikethrough closing + <|> verbatim + <|> atDoc + <|> wordy closing + + leaf = leafy mzero + + atDoc = src <|> evalInline <|> signature <|> signatureInline + where + comma = lit "," <* CP.space + src = + src' Source "@source" + <|> src' FoldedSource "@foldedSource" + srcElem = + SourceElement + <$> (typeLink <|> termLink) + <*> ( fmap (fromMaybe []) . P.optional $ + (lit "@") *> (CP.space *> annotations) + ) + where + annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space + annotations = + P.some (EmbedAnnotation <$> annotation) + src' name atName = fmap name $ do + _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' srcElem comma + _ <- lit "}" + pure s + signature = fmap Signature $ do + _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' signatureLink comma + _ <- lit "}" + pure s + signatureInline = fmap SignatureInline $ do + _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space + s <- signatureLink + _ <- lit "}" + pure s + evalInline = fmap EvalInline $ do + _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space + let inlineEvalClose = () <$ lit "}" + s <- code inlineEvalClose + pure s + + typeLink = fmap EmbedTypeLink $ do + _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space + tokenP identifierP <* CP.space + + termLink = + fmap EmbedTermLink $ + tokenP identifierP <* CP.space + + signatureLink = + fmap EmbedSignatureLink $ + tokenP identifierP <* CP.space + + groupy closing p = do + Token p _ _ <- tokenP p + after <- P.optional . P.try $ leafy closing + pure $ case after of + Nothing -> p + Just after -> + Group + . Join + $ p + :| pure after + + verbatim = + P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do + Token originalText start stop <- tokenP do + -- a single backtick followed by a non-backtick is treated as monospaced + let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) + -- also two or more ' followed by that number of closing ' + quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) + P.someTill P.anySingle (lit quotes) + let isMultiLine = line start /= line stop + if isMultiLine + then do + let trimmed = (trimAroundDelimiters originalText) + let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' + pure . Verbatim $ + Word $ + Token txt start stop + else + pure . Code $ + Word $ + Token originalText start stop + + exampleInline = + P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ + fmap Example $ do + n <- P.try $ do + _ <- lit "`" + length <$> P.takeWhile1P (Just "backticks") (== '`') + let end = () <$ lit (replicate (n + 1) '`') + ex <- CP.space *> code end + pure ex + + link = + P.label "link (examples: {type List}, {Nat.+})" $ + fmap Link $ + P.try $ + lit "{" *> (typeLink <|> termLink) <* lit "}" + + expr :: P (Leaf code x) + expr = + fmap Transclude . 
P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + lit "{{" *> code (() <$ lit "}}") + + nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' + nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace + + -- Allows whitespace or a newline, but not more than two newlines in a row. + whitespaceWithoutParagraphBreak :: P () + whitespaceWithoutParagraphBreak = void do + void nonNewlineSpaces + optional newline >>= \case + Just _ -> void nonNewlineSpaces + Nothing -> pure () + + fencedBlock = + P.label "block eval (syntax: a fenced code block)" $ + evalUnison <|> exampleBlock <|> other + where + evalUnison = fmap (wrap' . Eval) $ do + -- commit after seeing that ``` is on its own line + fence <- P.try $ do + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) + fence <$ guard b + CP.space + *> code (() <$ lit fence) + + exampleBlock = fmap (wrap' . ExampleBlock) $ do + void $ lit "@typecheck" <* CP.space + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + code (() <$ lit fence) + + uncolumn column tabWidth s = + let skip col r | col < 1 = r + skip col s@('\t' : _) | col < tabWidth = s + skip col ('\t' : r) = skip (col - tabWidth) r + skip col (c : r) + | isSpace c && (not $ isControl c) = + skip (col - 1) r + skip _ s = s + in List.intercalate "\n" $ skip column <$> lines s + + other = fmap (uncurry $ wrapSimple2 CodeBlock) $ do + column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel + let tabWidth = toInteger . P.unPos $ P.defaultTabWidth + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + name <- + P.takeWhileP Nothing nonNewlineSpace + *> tokenP (P.takeWhile1P Nothing (not . isSpace)) + <* P.takeWhileP Nothing nonNewlineSpace + _ <- void CP.eol + verbatim <- + tokenP $ + uncolumn column tabWidth . trimAroundDelimiters + <$> P.someTill P.anySingle ([] <$ lit fence) + pure (name, verbatim) + + boldOrItalicOrStrikethrough closing = do + let start = + some (P.satisfy (== '*')) + <|> some (P.satisfy (== '_')) + <|> some + (P.satisfy (== '~')) + name s = + if take 1 s == "~" + then Strikethrough + else if take 1 s == "*" then Bold else Italic + end <- P.try $ do + end <- start + P.lookAhead (P.satisfy (not . isSpace)) + pure end + name end . wrap' . Paragraph + <$> someTill' + (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) + (lit end) + + externalLink = + P.label "hyperlink (example: [link name](https://destination.com))" $ + fmap (uncurry NamedLink) $ do + _ <- lit "[" + p <- leafies (void $ char ']') + _ <- lit "]" + _ <- lit "(" + target <- + fmap (Group . Join) $ + fmap pure link <|> some' (expr <|> wordy (char ')')) + _ <- lit ")" + pure (p, target) + + -- newline = P.optional (lit "\r") *> lit "\n" + + sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + + spaced p = some' (p <* P.optional sp) + leafies close = wrap' . Paragraph <$> spaced (leafy close) + + list = bulletedList <|> numberedList + + bulletedList = wrap' . BulletedList <$> sepBy1' bullet listSep + numberedList = wrap' . 
NumberedList <$> sepBy1' numberedItem listSep
+
+    listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart)
+
+    bulletedStart = P.try $ do
+      r <- listItemStart' $ [] <$ P.satisfy bulletChar
+      P.lookAhead (P.satisfy isSpace)
+      pure r
+      where
+        bulletChar ch = ch == '*' || ch == '-' || ch == '+'
+
+    listItemStart' :: P a -> P (Int, a)
+    listItemStart' gutter = P.try $ do
+      nonNewlineSpaces
+      col <- column <$> posP
+      parentCol <- S.gets parentListColumn
+      guard (col > parentCol)
+      (col,) <$> gutter
+
+    numberedStart =
+      listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".")
+
+    listItemParagraph = fmap (wrap' . Paragraph) $ do
+      col <- column <$> posP
+      some' (leaf <* sep col)
+      where
+        -- Trickiness here to support hard line breaks inside of
+        -- a bulleted list, so for instance this parses as expected:
+        --
+        --   * uno dos
+        --     tres quatro
+        --   * alice bob
+        --     carol dave eve
+        sep col = do
+          _ <- nonNewlineSpaces
+          _ <-
+            P.optional . P.try $
+              newline
+                *> nonNewlineSpaces
+                *> do
+                  col2 <- column <$> posP
+                  guard $ col2 >= col
+                  (P.notFollowedBy $ void numberedStart <|> void bulletedStart)
+          pure ()
+
+    numberedItem = P.label msg $ do
+      (col, s) <- numberedStart
+      (s,)
+        <$> ( fmap (uncurry Column) $ do
+                p <- nonNewlineSpaces *> listItemParagraph
+                subList <-
+                  local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list)
+                pure (p, subList)
+            )
+      where
+        msg = "numbered list (examples: 1. item1, 8. start numbering at '8')"
+
+    bullet = fmap (uncurry Column) . P.label "bullet (examples: * item1, - item2)" $ do
+      (col, _) <- bulletedStart
+      p <- nonNewlineSpaces *> listItemParagraph
+      subList <-
+        local
+          (\e -> e {parentListColumn = col})
+          (P.optional $ listSep *> list)
+      pure (p, subList)
+
+    newline = P.label "newline" $ lit "\n" <|> lit "\r\n"
+
+    -- ## Section title
+    --
+    -- A paragraph under this section.
+    -- Part of the same paragraph. Blanklines separate paragraphs.
+    --
+    -- ### A subsection title
+    --
+    -- A paragraph under this subsection.
+
+    -- # A section title (not a subsection)
+    section :: P (Tree code)
+    section = fmap (wrap' . uncurry Section) $ do
+      ns <- S.gets parentSections
+      hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp
+      title <- paragraph <* CP.space
+      let m = length hashes + head ns
+      body <-
+        local (\env -> env {parentSections = (m : (tail ns))}) $
+          P.many (sectionElem <* CP.space)
+      pure $ (title, body)
+
+    wrap' :: Top code (Tree code) -> Tree code
+    wrap' doc = ann doc :< doc
+
+    wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Top code (Tree code)) -> a -> b -> Tree code
+    wrapSimple2 fn a b = ann a <> ann b :< fn a b
+
+-- | If it's a multi-line verbatim block we trim any whitespace representing
+-- indentation from the pretty-printer.
+--
+-- E.g.
+--
+-- @@
+-- {{
+-- # Heading
+--   '''
+--   code
+--    indented
+--   '''
+-- }}
+-- @@
+--
+-- Should lex to the text literal "code\n indented".
+--
+-- If there's text in the literal that has LESS leading whitespace than the
+-- opening delimiters, we don't trim it at all. E.g.
+--
+-- @@
+-- {{
+-- # Heading
+--    '''
+--  code
+--    '''
+-- }}
+-- @@
+--
+-- Is parsed as " code".
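+--
+-- (The prefix check is per-line: each line must either start with the
+-- expected indentation or consist entirely of whitespace; otherwise the
+-- whole literal is left untouched.)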
+-- +-- Trim the expected amount of whitespace from a text literal: +-- >>> trimIndentFromVerbatimBlock 2 " code\n indented" +-- "code\n indented" +-- +-- If the text literal has less leading whitespace than the opening delimiters, +-- leave it as-is +-- >>> trimIndentFromVerbatimBlock 2 "code\n indented" +-- "code\n indented" +trimIndentFromVerbatimBlock :: Int -> String -> String +trimIndentFromVerbatimBlock leadingSpaces txt = fromMaybe txt $ do + List.intercalate "\n" <$> for (lines txt) \line -> do + -- If any 'stripPrefix' fails, we fail and return the unaltered text + case List.stripPrefix (replicate leadingSpaces ' ') line of + Just stripped -> Just stripped + Nothing -> + -- If it was a line with all white-space, just use an empty line, + -- this can happen easily in editors which trim trailing whitespace. + if all isSpace line + then Just "" + else Nothing + +-- Trim leading/trailing whitespace from around delimiters, e.g. +-- +-- {{ +-- '''___ <- whitespace here including newline +-- text block +-- 👇 or here +-- __''' +-- }} +-- >>> trimAroundDelimiters " \n text block \n " +-- " text block " +-- +-- Should leave leading and trailing line untouched if it contains non-whitespace, e.g.: +-- +-- ''' leading whitespace +-- text block +-- trailing whitespace: ''' +-- >>> trimAroundDelimiters " leading whitespace\n text block \ntrailing whitespace: " +-- " leading whitespace\n text block \ntrailing whitespace: " +-- +-- Should keep trailing newline if it's the only thing on the line, e.g.: +-- +-- ''' +-- newline below +-- +-- ''' +-- >>> trimAroundDelimiters "\nnewline below\n\n" +-- "newline below\n\n" +trimAroundDelimiters :: String -> String +trimAroundDelimiters txt = + txt + & ( \s -> + List.breakOn "\n" s + & \case + (prefix, suffix) + | all isSpace prefix -> drop 1 suffix + | otherwise -> prefix <> suffix + ) + & ( \s -> + List.breakOnEnd "\n" s + & \case + (_prefix, "") -> s + (prefix, suffix) + | all isSpace suffix -> dropTrailingNewline prefix + | otherwise -> prefix <> suffix + ) + where + dropTrailingNewline = \case + [] -> [] + (x : xs) -> NonEmpty.init (x NonEmpty.:| xs) diff --git a/unison-syntax/test/Main.hs b/unison-syntax/test/Main.hs index 5c13940b0a..b7235f299b 100644 --- a/unison-syntax/test/Main.hs +++ b/unison-syntax/test/Main.hs @@ -10,7 +10,7 @@ import Unison.Prelude import Unison.ShortHash (ShortHash) import Unison.ShortHash qualified as ShortHash import Unison.Syntax.HashQualifiedPrime qualified as HQ' (unsafeParseText) -import Unison.Syntax.Lexer +import Unison.Syntax.Lexer.Unison main :: IO () main = diff --git a/unison-syntax/unison-syntax.cabal b/unison-syntax/unison-syntax.cabal index 853da4c817..0da37d0036 100644 --- a/unison-syntax/unison-syntax.cabal +++ b/unison-syntax/unison-syntax.cabal @@ -23,9 +23,11 @@ library Unison.Syntax.HashQualifiedPrime Unison.Syntax.Lexer Unison.Syntax.Lexer.Token + Unison.Syntax.Lexer.Unison Unison.Syntax.Name Unison.Syntax.NameSegment Unison.Syntax.Parser + Unison.Syntax.Parser.Doc Unison.Syntax.Parser.Doc.Data Unison.Syntax.ReservedWords Unison.Syntax.ShortHash From e9512a69ce03e137758f8adc81dae04c422260d9 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 12:07:05 -0600 Subject: [PATCH 17/22] Split the Doc parser into multiple functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In general, they map to the constructors of the Doc types, with some wiggle room for now. It’s probably beneficial to review this commit by ignoring whitespace. 
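For reviewers, the rough shape of the split (signatures copied from the diff
below): each extracted parser now takes the code parser, and where needed the
doc-closing parser, as explicit arguments instead of closing over them:

    sectionElem :: (Annotated code) => (P () -> P code) -> P () -> P (Tree code)
    paragraph :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code))
    word :: P end -> P (Leaf code void)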
--- .../src/Unison/Syntax/Lexer/Unison.hs | 2 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 672 ++++++++++-------- 2 files changed, 375 insertions(+), 299 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index dcaf9ca6d3..86e75b1afe 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -313,7 +313,7 @@ doc2 = do } ) do - body <- Doc.untitledSection lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.untitledSection . Doc.sectionElem lexemes' . P.lookAhead $ () <$ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index 5ca747f204..99122bd5ff 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -1,4 +1,44 @@ -module Unison.Syntax.Parser.Doc where +module Unison.Syntax.Parser.Doc + ( Tree, + untitledSection, + sectionElem, + leaf, + + -- * section elements + section, + eval, + exampleBlock, + codeBlock, + list, + bulletedList, + numberedList, + paragraph, + + -- * leaves + link, + namedLink, + example, + transclude, + bold, + italic, + strikethrough, + verbatim, + source, + foldedSource, + evalInline, + signatures, + signatureInline, + group, + word, + + -- * other components + column', + embedTypeLink, + embedTermLink, + embedSignatureLink, + join, + ) +where import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.State qualified as S @@ -13,7 +53,7 @@ import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) -import Unison.Prelude +import Unison.Prelude hiding (join) import Unison.Syntax.Lexer ( P, ParsingEnv (..), @@ -37,146 +77,221 @@ type Tree code = Cofree (Top code) Ann -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -untitledSection :: forall code. (Annotated code) => (P () -> P code) -> P () -> P (UntitledSection (Tree code)) -untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.space) +untitledSection :: P a -> P (UntitledSection a) +untitledSection a = UntitledSection <$> P.many (a <* CP.space) + +wordyKw :: String -> P String +wordyKw kw = separated wordySep (lit kw) + +sectionElem :: (Annotated code) => (P () -> P code) -> P () -> P (Tree code) +sectionElem code docClose = + fmap wrap' $ + section code docClose + <|> P.label "block eval (syntax: a fenced code block)" (eval code <|> exampleBlock code <|> codeBlock) + <|> list code docClose + <|> paragraph code docClose + +paragraph :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +paragraph code = fmap Paragraph . spaced . leafy code + +word :: P end -> P (Leaf code void) +word closing = fmap Word . tokenP . P.try $ do + let end = P.lookAhead $ void (P.satisfy isSpace) <|> void closing + word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end + guard (not $ reserved word || null word) + pure word where - wordyKw kw = separated wordySep (lit kw) - sectionElem = section <|> fencedBlock <|> list <|> paragraph - paragraph = wrap' . Paragraph <$> spaced leaf reserved word = List.isPrefixOf "}}" word || all (== '#') word - wordy :: P end -> P (Leaf code void) - wordy closing = fmap Word . tokenP . 
P.try $ do - let end = - P.lookAhead $ - docClose - <|> void (P.satisfy isSpace) - <|> void closing - word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end - guard (not $ reserved word || null word) - pure word - - leafy closing = groupy closing gs - where - gs = - link - <|> externalLink - <|> exampleInline - <|> expr - <|> boldOrItalicOrStrikethrough closing - <|> verbatim - <|> atDoc - <|> wordy closing - - leaf = leafy mzero - - atDoc = src <|> evalInline <|> signature <|> signatureInline +leaf :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +leaf code closing = + do + link + <|> namedLink code closing + <|> example code + <|> transclude code + <|> bold code closing + <|> italic code closing + <|> strikethrough code closing + <|> verbatim + <|> source code + <|> foldedSource code + <|> evalInline code + <|> signatures + <|> signatureInline + <|> word closing + +leafy :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +leafy code closing = do + p <- leaf code closing + after <- P.optional . P.try $ leafy code closing + case after of + Nothing -> pure p + Just after -> group . pure $ p :| pure after + +comma :: P String +comma = lit "," <* CP.space + +source :: (P () -> P code) -> P (Leaf code a) +source = fmap Source . (lit "@source" *>) . sourceElements + +foldedSource :: (P () -> P code) -> P (Leaf code a) +foldedSource = fmap FoldedSource . (lit "@foldedSource" *>) . sourceElements + +sourceElements :: (P () -> P code) -> P (NonEmpty (SourceElement (Leaf code Void))) +sourceElements code = do + _ <- (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' srcElem comma + _ <- lit "}" + pure s + where + srcElem = + SourceElement + <$> embedLink + <*> ( fmap (fromMaybe []) . P.optional $ + (lit "@") *> (CP.space *> annotations) + ) where - comma = lit "," <* CP.space - src = - src' Source "@source" - <|> src' FoldedSource "@foldedSource" - srcElem = - SourceElement - <$> (typeLink <|> termLink) - <*> ( fmap (fromMaybe []) . P.optional $ - (lit "@") *> (CP.space *> annotations) - ) - where - annotation = fmap Left (tokenP identifierP) <|> fmap Right expr <* CP.space - annotations = - P.some (EmbedAnnotation <$> annotation) - src' name atName = fmap name $ do - _ <- lit atName *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' srcElem comma - _ <- lit "}" - pure s - signature = fmap Signature $ do - _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' signatureLink comma - _ <- lit "}" - pure s - signatureInline = fmap SignatureInline $ do - _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- signatureLink - _ <- lit "}" - pure s - evalInline = fmap EvalInline $ do - _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space - let inlineEvalClose = () <$ lit "}" - s <- code inlineEvalClose - pure s - - typeLink = fmap EmbedTypeLink $ do - _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tokenP identifierP <* CP.space - - termLink = - fmap EmbedTermLink $ - tokenP identifierP <* CP.space - - signatureLink = - fmap EmbedSignatureLink $ - tokenP identifierP <* CP.space - - groupy closing p = do - Token p _ _ <- tokenP p - after <- P.optional . P.try $ leafy closing - pure $ case after of - Nothing -> p - Just after -> - Group - . 
Join - $ p - :| pure after - - verbatim = - P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do - Token originalText start stop <- tokenP do - -- a single backtick followed by a non-backtick is treated as monospaced - let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) - -- also two or more ' followed by that number of closing ' - quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) - P.someTill P.anySingle (lit quotes) - let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . Verbatim $ - Word $ - Token txt start stop - else - pure . Code $ - Word $ - Token originalText start stop - - exampleInline = - P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ - fmap Example $ do - n <- P.try $ do - _ <- lit "`" - length <$> P.takeWhile1P (Just "backticks") (== '`') - let end = () <$ lit (replicate (n + 1) '`') - ex <- CP.space *> code end - pure ex - - link = - P.label "link (examples: {type List}, {Nat.+})" $ - fmap Link $ - P.try $ - lit "{" *> (typeLink <|> termLink) <* lit "}" - - expr :: P (Leaf code x) - expr = - fmap Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ - lit "{{" *> code (() <$ lit "}}") - + annotation = fmap Left (tokenP identifierP) <|> fmap Right (transclude code) <* CP.space + annotations = + P.some (EmbedAnnotation <$> annotation) + +signatures :: P (Leaf code a) +signatures = fmap Signature $ do + _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space + s <- sepBy1' embedSignatureLink comma + _ <- lit "}" + pure s + +signatureInline :: P (Leaf code a) +signatureInline = fmap SignatureInline $ do + _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space + s <- embedSignatureLink + _ <- lit "}" + pure s + +evalInline :: (P () -> P a1) -> P (Leaf a1 a2) +evalInline code = fmap EvalInline $ do + _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space + let inlineEvalClose = void $ lit "}" + s <- code inlineEvalClose + pure s + +embedTypeLink :: P EmbedLink +embedTypeLink = + EmbedTypeLink <$> do + _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space + tokenP identifierP <* CP.space + +embedTermLink :: P EmbedLink +embedTermLink = EmbedTermLink <$> tokenP identifierP <* CP.space + +embedSignatureLink :: P EmbedSignatureLink +embedSignatureLink = EmbedSignatureLink <$> tokenP identifierP <* CP.space + +verbatim :: P (Leaf code a) +verbatim = + P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do + Token originalText start stop <- tokenP do + -- a single backtick followed by a non-backtick is treated as monospaced + let tick = P.try (lit "`" <* P.lookAhead (P.satisfy (/= '`'))) + -- also two or more ' followed by that number of closing ' + quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) + P.someTill P.anySingle (lit quotes) + let isMultiLine = line start /= line stop + if isMultiLine + then do + let trimmed = (trimAroundDelimiters originalText) + let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' + pure . 
Verbatim $ + Word $ + Token txt start stop + else + pure . Code $ + Word $ + Token originalText start stop + +example :: (P () -> P code) -> P (Leaf code void) +example code = + P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ + fmap Example $ do + n <- P.try $ do + _ <- lit "`" + length <$> P.takeWhile1P (Just "backticks") (== '`') + let end = void . lit $ replicate (n + 1) '`' + CP.space *> code end + +link :: P (Leaf a b) +link = P.label "link (examples: {type List}, {Nat.+})" $ Link <$> P.try (lit "{" *> embedLink <* lit "}") + +transclude :: (P () -> P code) -> P (Leaf code x) +transclude code = + fmap Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ + lit "{{" *> code (void $ lit "}}") + +nonNewlineSpaces :: P String +nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace + where nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' - nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace +eval :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +eval code = + Eval <$> do + -- commit after seeing that ``` is on its own line + fence <- P.try $ do + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) + fence <$ guard b + CP.space *> code (void $ lit fence) + +exampleBlock :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +exampleBlock code = + ExampleBlock + <$> do + void $ lit "@typecheck" <* CP.space + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + code . void $ lit fence + +codeBlock :: P (Top code (Tree code)) +codeBlock = do + column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel + let tabWidth = toInteger . P.unPos $ P.defaultTabWidth + fence <- lit "```" <+> P.takeWhileP Nothing (== '`') + name <- + nonNewlineSpaces + *> tokenP (P.takeWhile1P Nothing (not . isSpace)) + <* nonNewlineSpaces + _ <- void CP.eol + verbatim <- + tokenP $ + uncolumn column tabWidth . trimAroundDelimiters + <$> P.someTill P.anySingle ([] <$ lit fence) + pure $ CodeBlock name verbatim + where + uncolumn column tabWidth s = + let skip col r | col < 1 = r + skip col s@('\t' : _) | col < tabWidth = s + skip col ('\t' : r) = skip (col - tabWidth) r + skip col (c : r) + | isSpace c && (not $ isControl c) = + skip (col - 1) r + skip _ s = s + in List.intercalate "\n" $ skip column <$> lines s + +emphasis :: (Annotated code) => Char -> (P () -> P code) -> P () -> P (Tree code) +emphasis delimiter code closing = do + let start = some (P.satisfy (== delimiter)) + end <- P.try $ do + end <- start + P.lookAhead (P.satisfy (not . isSpace)) + pure end + wrap' . Paragraph + <$> someTill' + (leafy code (closing <|> (void $ lit end)) <* void whitespaceWithoutParagraphBreak) + (lit end) + where -- Allows whitespace or a newline, but not more than two newlines in a row. whitespaceWithoutParagraphBreak :: P () whitespaceWithoutParagraphBreak = void do @@ -185,124 +300,92 @@ untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.sp Just _ -> void nonNewlineSpaces Nothing -> pure () - fencedBlock = - P.label "block eval (syntax: a fenced code block)" $ - evalUnison <|> exampleBlock <|> other - where - evalUnison = fmap (wrap' . 
Eval) $ do - -- commit after seeing that ``` is on its own line - fence <- P.try $ do - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - b <- all isSpace <$> P.lookAhead (P.takeWhileP Nothing (/= '\n')) - fence <$ guard b - CP.space - *> code (() <$ lit fence) - - exampleBlock = fmap (wrap' . ExampleBlock) $ do - void $ lit "@typecheck" <* CP.space - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - code (() <$ lit fence) - - uncolumn column tabWidth s = - let skip col r | col < 1 = r - skip col s@('\t' : _) | col < tabWidth = s - skip col ('\t' : r) = skip (col - tabWidth) r - skip col (c : r) - | isSpace c && (not $ isControl c) = - skip (col - 1) r - skip _ s = s - in List.intercalate "\n" $ skip column <$> lines s - - other = fmap (uncurry $ wrapSimple2 CodeBlock) $ do - column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel - let tabWidth = toInteger . P.unPos $ P.defaultTabWidth - fence <- lit "```" <+> P.takeWhileP Nothing (== '`') - name <- - P.takeWhileP Nothing nonNewlineSpace - *> tokenP (P.takeWhile1P Nothing (not . isSpace)) - <* P.takeWhileP Nothing nonNewlineSpace - _ <- void CP.eol - verbatim <- - tokenP $ - uncolumn column tabWidth . trimAroundDelimiters - <$> P.someTill P.anySingle ([] <$ lit fence) - pure (name, verbatim) - - boldOrItalicOrStrikethrough closing = do - let start = - some (P.satisfy (== '*')) - <|> some (P.satisfy (== '_')) - <|> some - (P.satisfy (== '~')) - name s = - if take 1 s == "~" - then Strikethrough - else if take 1 s == "*" then Bold else Italic - end <- P.try $ do - end <- start - P.lookAhead (P.satisfy (not . isSpace)) - pure end - name end . wrap' . Paragraph - <$> someTill' - (leafy (closing <|> (void $ lit end)) <* whitespaceWithoutParagraphBreak) - (lit end) - - externalLink = - P.label "hyperlink (example: [link name](https://destination.com))" $ - fmap (uncurry NamedLink) $ do - _ <- lit "[" - p <- leafies (void $ char ']') - _ <- lit "]" - _ <- lit "(" - target <- - fmap (Group . Join) $ - fmap pure link <|> some' (expr <|> wordy (char ')')) - _ <- lit ")" - pure (p, target) - - -- newline = P.optional (lit "\r") *> lit "\n" - - sp = P.try $ do - spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) - case close of - Nothing -> guard $ ok spaces - Just _ -> pure () - pure spaces - where - ok s = length [() | '\n' <- s] < 2 +bold :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +bold code = fmap Bold . emphasis '*' code + +italic :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +italic code = fmap Italic . emphasis '_' code + +strikethrough :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +strikethrough code = fmap Strikethrough . emphasis '~' code + +namedLink :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) +namedLink code docClose = + P.label "hyperlink (example: [link name](https://destination.com))" do + _ <- lit "[" + p <- spaced . leafy code . 
void $ char ']' + _ <- lit "]" + _ <- lit "(" + target <- group $ fmap pure link <|> some' (transclude code <|> word (docClose <|> void (char ')'))) + _ <- lit ")" + pure $ NamedLink (wrap' $ Paragraph p) target + +sp :: P String +sp = P.try $ do + spaces <- P.takeWhile1P (Just "space") isSpace + close <- P.optional (P.lookAhead (lit "}}")) + case close of + Nothing -> guard $ ok spaces + Just _ -> pure () + pure spaces + where + ok s = length [() | '\n' <- s] < 2 + +spaced :: P a -> P (NonEmpty a) +spaced p = some' (p <* P.optional sp) - spaced p = some' (p <* P.optional sp) - leafies close = wrap' . Paragraph <$> spaced (leafy close) +-- | Not an actual node, but this pattern is referenced in multiple places +list :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +list code docClose = bulletedList code docClose <|> numberedList code docClose - list = bulletedList <|> numberedList +listSep :: P () +listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) - bulletedList = wrap' . BulletedList <$> sepBy1' bullet listSep - numberedList = wrap' . NumberedList <$> sepBy1' numberedItem listSep +bulletedStart :: P (Int, [a]) +bulletedStart = P.try $ do + r <- listItemStart $ [] <$ P.satisfy bulletChar + P.lookAhead (P.satisfy isSpace) + pure r + where + bulletChar ch = ch == '*' || ch == '-' || ch == '+' + +listItemStart :: P a -> P (Int, a) +listItemStart gutter = P.try $ do + nonNewlineSpaces + col <- column <$> posP + parentCol <- S.gets parentListColumn + guard (col > parentCol) + (col,) <$> gutter + +numberedStart :: P (Int, Token Word64) +numberedStart = listItemStart $ P.try (tokenP $ LP.decimal <* lit ".") + +-- | FIXME: This should take a @`P` a@ +numberedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +numberedList code docClose = NumberedList <$> sepBy1' numberedItem listSep + where + numberedItem = P.label "numbered list (examples: 1. item1, 8. start numbering at '8')" do + (col, s) <- numberedStart + (s,) <$> column' code docClose col - listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) +-- | FIXME: This should take a @`P` a@ +bulletedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +bulletedList code docClose = BulletedList <$> sepBy1' bullet listSep + where + bullet = P.label "bullet (examples: * item1, - item2)" do + (col, _) <- bulletedStart + column' code docClose col - bulletedStart = P.try $ do - r <- listItemStart' $ [] <$ P.satisfy bulletChar - P.lookAhead (P.satisfy isSpace) - pure r - where - bulletChar ch = ch == '*' || ch == '-' || ch == '+' - - listItemStart' :: P a -> P (Int, a) - listItemStart' gutter = P.try $ do - nonNewlineSpaces - col <- column <$> posP - parentCol <- S.gets parentListColumn - guard (col > parentCol) - (col,) <$> gutter - - numberedStart = - listItemStart' $ P.try (tokenP $ LP.decimal <* lit ".") - - listItemParagraph = fmap (wrap' . Paragraph) $ do - col <- column <$> posP - some' (leaf <* sep col) +column' :: (Annotated code) => (P () -> P code) -> P () -> Int -> P (Column (Tree code)) +column' code docClose col = + Column . 
wrap' + <$> (nonNewlineSpaces *> listItemParagraph) + <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list code docClose)) + where + listItemParagraph = + Paragraph <$> do + col <- column <$> posP + some' (leafy code docClose <* sep col) where -- Trickiness here to support hard line breaks inside of -- a bulleted list, so for instance this parses as expected: @@ -323,55 +406,48 @@ untitledSection code docClose = UntitledSection <$> P.many (sectionElem <* CP.sp (P.notFollowedBy $ void numberedStart <|> void bulletedStart) pure () - numberedItem = P.label msg $ do - (col, s) <- numberedStart - (s,) - <$> ( fmap (uncurry Column) $ do - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> list) - pure (p, subList) - ) - where - msg = "numbered list (examples: 1. item1, 8. start numbering at '8')" +newline :: P String +newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - bullet = fmap (uncurry Column) . P.label "bullet (examples: * item1, - item2)" $ do - (col, _) <- bulletedStart - p <- nonNewlineSpaces *> listItemParagraph - subList <- - local - (\e -> e {parentListColumn = col}) - (P.optional $ listSep *> list) - pure (p, subList) - - newline = P.label "newline" $ lit "\n" <|> lit "\r\n" - - -- ## Section title - -- - -- A paragraph under this section. - -- Part of the same paragraph. Blanklines separate paragraphs. - -- - -- ### A subsection title - -- - -- A paragraph under this subsection. - - -- # A section title (not a subsection) - section :: P (Tree code) - section = fmap (wrap' . uncurry Section) $ do - ns <- S.gets parentSections - hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp - title <- paragraph <* CP.space - let m = length hashes + head ns - body <- - local (\env -> env {parentSections = (m : (tail ns))}) $ - P.many (sectionElem <* CP.space) - pure $ (title, body) - - wrap' :: Top code (Tree code) -> Tree code - wrap' doc = ann doc :< doc - - wrapSimple2 :: (Annotated a, Annotated b) => (a -> b -> Top code (Tree code)) -> a -> b -> Tree code - wrapSimple2 fn a b = ann a <> ann b :< fn a b +-- | +-- +-- > ## Section title +-- > +-- > A paragraph under this section. +-- > Part of the same paragraph. Blanklines separate paragraphs. +-- > +-- > ### A subsection title +-- > +-- > A paragraph under this subsection. +-- > +-- > # A section title (not a subsection) +section :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) +section code docClose = do + ns <- S.gets parentSections + hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp + title <- paragraph code docClose <* CP.space + let m = length hashes + head ns + body <- + local (\env -> env {parentSections = (m : (tail ns))}) $ + P.many (sectionElem code docClose <* CP.space) + pure $ Section (wrap' title) body + +-- | Not an actual node, but this pattern is referenced in multiple places +embedLink :: P EmbedLink +embedLink = embedTypeLink <|> embedTermLink + +-- | FIXME: This should just take a @`P` code@ and @`P` a@. +group :: P (NonEmpty (Leaf code a)) -> P (Leaf code a) +group = fmap Group . 
join + +-- | FIXME: This should just take a @`P` a@ +join :: P (NonEmpty a) -> P (Join a) +join = fmap Join + +-- * utility functions + +wrap' :: (Annotated code) => Top code (Tree code) -> Tree code +wrap' doc = ann doc :< doc -- | If it's a multi-line verbatim block we trim any whitespace representing -- indentation from the pretty-printer. @@ -425,7 +501,7 @@ trimIndentFromVerbatimBlock leadingSpaces txt = fromMaybe txt $ do then Just "" else Nothing --- Trim leading/trailing whitespace from around delimiters, e.g. +-- | Trim leading/trailing whitespace from around delimiters, e.g. -- -- {{ -- '''___ <- whitespace here including newline From a6528ac351c1b8f7e51363040d2f710e75a4a1ee Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 22:56:22 -0600 Subject: [PATCH 18/22] Generalize the Doc parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is now completely[^1] independent of the Unison language. The parser takes a few parsers as arguments: one for identifiers, one for code, and one to indicate the end of the Doc block. [^1]: There is one last bit to be removed in the next commit – Doc still looks for `type` or `ability` to identify type links. --- .../src/Unison/Syntax/TermParser.hs | 16 +- unison-syntax/src/Unison/Parser/Ann.hs | 4 + unison-syntax/src/Unison/Syntax/Lexer.hs | 170 +------- .../src/Unison/Syntax/Lexer/Token.hs | 6 +- .../src/Unison/Syntax/Lexer/Unison.hs | 117 +++++- unison-syntax/src/Unison/Syntax/Parser.hs | 2 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 364 +++++++++++------- .../src/Unison/Syntax/Parser/Doc/Data.hs | 72 ++-- 8 files changed, 412 insertions(+), 339 deletions(-) diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 4c3069b9ff..8d0195410a 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -530,7 +530,7 @@ doc2Block = do docUntitledSection ann (Doc.UntitledSection tops) = Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops - docTop :: Doc.Top [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docTop :: Doc.Top (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docTop d = case d of Doc.Section title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] Doc.Eval code -> @@ -558,7 +558,7 @@ doc2Block = do docColumn d@(Doc.Column para sublist) = Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist - docLeaf :: Doc.Leaf [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docLeaf :: Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docLeaf d = case d of Doc.Link link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link Doc.NamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) @@ -590,7 +590,7 @@ doc2Block = do Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docEmbedLink :: Doc.EmbedLink -> TermP v m + docEmbedLink :: Doc.EmbedLink (HQ'.HashQualified Name) -> TermP v m docEmbedLink d = case d of Doc.EmbedTypeLink ident -> Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload @@ -598,17 +598,21 @@ doc2Block = do Doc.EmbedTermLink ident -> Term.app (gann d) (f d "EmbedTermLink") . 
addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docSourceElement :: Doc.SourceElement (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m + docSourceElement :: + Doc.SourceElement (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + TermP v m docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] - docEmbedSignatureLink :: Doc.EmbedSignatureLink -> TermP v m + docEmbedSignatureLink :: Doc.EmbedSignatureLink (HQ'.HashQualified Name) -> TermP v m docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) - docEmbedAnnotation :: Doc.EmbedAnnotation (Doc.Leaf [L.Token L.Lexeme] Void) -> TermP v m + docEmbedAnnotation :: + Doc.EmbedAnnotation (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + TermP v m docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes diff --git a/unison-syntax/src/Unison/Parser/Ann.hs b/unison-syntax/src/Unison/Parser/Ann.hs index 961bbcb30c..e4b361d148 100644 --- a/unison-syntax/src/Unison/Parser/Ann.hs +++ b/unison-syntax/src/Unison/Parser/Ann.hs @@ -4,6 +4,7 @@ module Unison.Parser.Ann where +import Control.Comonad.Cofree (Cofree ((:<))) import Data.List.NonEmpty (NonEmpty) import Data.Void (absurd) import Unison.Lexer.Pos qualified as L @@ -100,3 +101,6 @@ instance (Annotated a) => Annotated (Maybe a) where instance Annotated Void where ann = absurd + +instance (Annotated a) => Annotated (Cofree f a) where + ann (a :< _) = ann a diff --git a/unison-syntax/src/Unison/Syntax/Lexer.hs b/unison-syntax/src/Unison/Syntax/Lexer.hs index cfd932cd7e..5e6d18293f 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer.hs @@ -1,12 +1,8 @@ -{-# LANGUAGE TemplateHaskell #-} -{-# OPTIONS_GHC -Wno-orphans #-} - -- | This currently contains a mix of general lexing utilities and identifier-y lexers. 
module Unison.Syntax.Lexer ( Token (..), Line, Column, - Err (..), Pos (..), touches, @@ -15,16 +11,10 @@ module Unison.Syntax.Lexer wordyIdStartChar, symbolyIdChar, - -- * new exports - BlockName, - Layout, - ParsingEnv (..), - P, + -- * other utils local, - parseFailure, space, lit, - err, commitAfter2, (<+>), some', @@ -32,99 +22,35 @@ module Unison.Syntax.Lexer sepBy1', separated, wordySep, - identifierP, - wordyIdSegP, - shortHashP, - topBlockName, pop, typeOrAbilityAlt, - typeModifiersAlt, inc, ) where -import Control.Comonad.Cofree (Cofree ((:<))) import Control.Monad.State qualified as S import Data.Char (isSpace) import Data.List.NonEmpty (NonEmpty ((:|))) -import Data.Text qualified as Text import Text.Megaparsec qualified as P import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP -import Text.Megaparsec.Error qualified as EP -import Text.Megaparsec.Internal qualified as PI -import Unison.HashQualifiedPrime qualified as HQ' import Unison.Lexer.Pos (Column, Line, Pos (Pos), column, line) -import Unison.Name (Name) -import Unison.NameSegment (NameSegment) -import Unison.Parser.Ann (Annotated (..)) import Unison.Prelude -import Unison.ShortHash (ShortHash) -import Unison.Syntax.Lexer.Token (Token (..), posP) -import Unison.Syntax.Name qualified as Name (nameP) +import Unison.Syntax.Lexer.Token (Token (..)) import Unison.Syntax.NameSegment (symbolyIdChar, wordyIdChar, wordyIdStartChar) -import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) -import Unison.Syntax.ReservedWords (typeModifiers, typeOrAbility) -import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) - -instance (Annotated a) => Annotated (Cofree f a) where - ann (a :< _) = ann a - -type BlockName = String - -type Layout = [(BlockName, Column)] - -data ParsingEnv = ParsingEnv - { -- | layout stack - layout :: !Layout, - -- | `Just b` if a block of type `b` is being opened - opening :: Maybe BlockName, - -- | are we inside a construct that uses layout? - inLayout :: Bool, - -- | Use a stack to remember the parent section and allow docSections within docSections. - -- - 1 means we are inside a # Heading 1 - parentSections :: [Int], - -- | 4 means we are inside a list starting at the fourth column - parentListColumn :: Int - } - deriving (Show) - -type P = P.ParsecT (Token Err) String (S.State ParsingEnv) - -local :: (ParsingEnv -> ParsingEnv) -> P a -> P a +import Unison.Syntax.ReservedWords (typeOrAbility) + +local :: (P.MonadParsec e s' m, S.MonadState s m) => (s -> s) -> m a -> m a local f p = do env0 <- S.get S.put (f env0) e <- P.observing p S.put env0 case e of - Left e -> parseFailure e + Left e -> P.parseError e Right a -> pure a -parseFailure :: EP.ParseError [Char] (Token Err) -> P a -parseFailure e = PI.ParsecT $ \s _ _ _ eerr -> eerr e s - -data Err - = ReservedWordyId String - | InvalidSymbolyId String - | ReservedSymbolyId String - | InvalidShortHash String - | InvalidBytesLiteral String - | InvalidHexLiteral - | InvalidOctalLiteral - | Both Err Err - | MissingFractional String -- ex `1.` rather than `1.04` - | MissingExponent String -- ex `1e` rather than `1e3` - | UnknownLexeme - | TextLiteralMissingClosingQuote String - | InvalidEscapeCharacter Char - | LayoutError - | CloseWithoutMatchingOpen String String -- open, close - | UnexpectedDelimiter String - | UnexpectedTokens String -- Catch-all for all other lexer errors, representing some unexpected tokens. 
- deriving stock (Eq, Ord, Show) -- richer algebra - -space :: P () +space :: (P.MonadParsec e String m) => m () space = LP.space CP.space1 @@ -133,92 +59,42 @@ space = where fold = P.try $ lit "---" *> P.takeRest *> pure () -lit :: String -> P String +lit :: (P.MonadParsec e String m) => String -> m String lit = P.try . LP.symbol (pure ()) --- Committed failure -err :: Pos -> Err -> P x -err start t = do - stop <- posP - -- This consumes a character and therefore produces committed failure, - -- so `err s t <|> p2` won't try `p2` - _ <- void P.anySingle <|> P.eof - P.customFailure (Token t start stop) - -{- -commitAfter :: P a -> (a -> P b) -> P b -commitAfter a f = do - a <- P.try a - f a --} - -commitAfter2 :: P a -> P b -> (a -> b -> P c) -> P c +commitAfter2 :: (P.MonadParsec e s m) => m a -> m b -> (a -> b -> m c) -> m c commitAfter2 a b f = do (a, b) <- P.try $ liftA2 (,) a b f a b infixl 2 <+> -(<+>) :: (Monoid a) => P a -> P a -> P a -p1 <+> p2 = do a1 <- p1; a2 <- p2; pure (a1 <> a2) +(<+>) :: (Applicative f, Monoid a) => f a -> f a -> f a +(<+>) = liftA2 (<>) -- | Like `P.some`, but returns an actual `NonEmpty`. -some' :: P a -> P (NonEmpty a) +some' :: (P.MonadParsec e s m) => m a -> m (NonEmpty a) some' p = liftA2 (:|) p $ many p -- | Like `P.someTill`, but returns an actual `NonEmpty`. -someTill' :: P a -> P end -> P (NonEmpty a) +someTill' :: (P.MonadParsec e s m) => m a -> m end -> m (NonEmpty a) someTill' p end = liftA2 (:|) p $ P.manyTill p end -- | Like `P.sepBy1`, but returns an actual `NonEmpty`. -sepBy1' :: P a -> P sep -> P (NonEmpty a) +sepBy1' :: (P.MonadParsec e s m) => m a -> m sep -> m (NonEmpty a) sepBy1' p sep = liftA2 (:|) p . many $ sep *> p -separated :: (Char -> Bool) -> P a -> P a +separated :: (P.MonadParsec e s m) => (P.Token s -> Bool) -> m a -> m a separated ok p = P.try $ p <* P.lookAhead (void (P.satisfy ok) <|> P.eof) wordySep :: Char -> Bool wordySep c = isSpace c || not (wordyIdChar c) --- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is --- symboly (comprised of only symbols) or wordy (comprised of only alphanums). --- --- Examples: --- --- foo --- .foo.++.doc --- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") -identifierP :: P (HQ'.HashQualified Name) -identifierP = do - P.label "identifier (ex: abba1, snake_case, .foo.bar#xyz, .foo.++#xyz, or 🌻)" do - name <- PI.withParsecT (fmap nameSegmentParseErrToErr) Name.nameP - P.optional shortHashP <&> \case - Nothing -> HQ'.fromName name - Just shorthash -> HQ'.HashQualified name shorthash - where - nameSegmentParseErrToErr :: NameSegment.ParseErr -> Err - nameSegmentParseErrToErr = \case - NameSegment.ReservedOperator s -> ReservedSymbolyId (Text.unpack s) - NameSegment.ReservedWord s -> ReservedWordyId (Text.unpack s) - -wordyIdSegP :: P NameSegment -wordyIdSegP = - PI.withParsecT (fmap (ReservedWordyId . Text.unpack)) NameSegment.wordyP - -shortHashP :: P ShortHash -shortHashP = - PI.withParsecT (fmap (InvalidShortHash . 
Text.unpack)) ShortHash.shortHashP - -- `True` if the tokens are adjacent, with no space separating the two touches :: Token a -> Token b -> Bool touches (end -> t) (start -> t2) = line t == line t2 && column t == column t2 --- todo: make Layout a NonEmpty -topBlockName :: Layout -> Maybe BlockName -topBlockName [] = Nothing -topBlockName ((name, _) : _) = Just name - pop :: [a] -> [a] pop = drop 1 @@ -226,21 +102,5 @@ typeOrAbilityAlt :: (Alternative f) => (Text -> f a) -> f a typeOrAbilityAlt f = asum $ map f (toList typeOrAbility) -typeModifiersAlt :: (Alternative f) => (Text -> f a) -> f a -typeModifiersAlt f = - asum $ map f (toList typeModifiers) - inc :: Pos -> Pos inc (Pos line col) = Pos line (col + 1) - -instance EP.ShowErrorComponent (Token Err) where - showErrorComponent (Token err _ _) = go err - where - go = \case - UnexpectedTokens msg -> msg - CloseWithoutMatchingOpen open close -> "I found a closing " <> close <> " but no matching " <> open <> "." - Both e1 e2 -> go e1 <> "\n" <> go e2 - LayoutError -> "Indentation error" - TextLiteralMissingClosingQuote s -> "This text literal missing a closing quote: " <> excerpt s - e -> show e - excerpt s = if length s < 15 then s else take 15 s <> "..." diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs index e29f276c5e..f778dd66c0 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Token.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Token.hs @@ -6,7 +6,7 @@ module Unison.Syntax.Lexer.Token where import Data.Text qualified as Text -import Text.Megaparsec (ParsecT, TraversableStream) +import Text.Megaparsec (MonadParsec, TraversableStream) import Text.Megaparsec qualified as P import Unison.Lexer.Pos (Pos (Pos)) import Unison.Parser.Ann (Ann (Ann), Annotated (..)) @@ -43,14 +43,14 @@ instance Applicative Token where instance P.ShowErrorComponent (Token Text) where showErrorComponent = Text.unpack . 
payload -tokenP :: (Ord e, TraversableStream s) => ParsecT e s m a -> ParsecT e s m (Token a) +tokenP :: (Ord e, TraversableStream s, MonadParsec e s m) => m a -> m (Token a) tokenP p = do start <- posP payload <- p end <- posP pure Token {payload, start, end} -posP :: (Ord e, TraversableStream s) => ParsecT e s m Pos +posP :: (Ord e, TraversableStream s, MonadParsec e s m) => m Pos posP = do p <- P.getSourcePos pure (Pos (P.unPos (P.sourceLine p)) (P.unPos (P.sourceColumn p))) diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index 86e75b1afe..98112c2124 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -1,6 +1,3 @@ -{-# LANGUAGE TemplateHaskell #-} -{-# OPTIONS_GHC -Wno-orphans #-} - module Unison.Syntax.Lexer.Unison ( Token (..), Line, @@ -44,9 +41,11 @@ import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Text.Megaparsec.Error qualified as EP +import Text.Megaparsec.Internal qualified as PI import Unison.HashQualifiedPrime qualified as HQ' import Unison.Name (Name) import Unison.Name qualified as Name +import Unison.NameSegment (NameSegment) import Unison.NameSegment qualified as NameSegment (docSegment) import Unison.NameSegment.Internal qualified as NameSegment import Unison.Prelude @@ -55,13 +54,51 @@ import Unison.ShortHash qualified as SH import Unison.Syntax.HashQualifiedPrime qualified as HQ' (toText) import Unison.Syntax.Lexer import Unison.Syntax.Lexer.Token (posP, tokenP) -import Unison.Syntax.Name qualified as Name (isSymboly, toText, unsafeParseText) +import Unison.Syntax.Name qualified as Name (isSymboly, nameP, toText, unsafeParseText) +import Unison.Syntax.NameSegment qualified as NameSegment (ParseErr (..), wordyP) import Unison.Syntax.Parser.Doc qualified as Doc import Unison.Syntax.Parser.Doc.Data qualified as Doc import Unison.Syntax.ReservedWords (delimiters, typeModifiers, typeOrAbility) +import Unison.Syntax.ShortHash qualified as ShortHash (shortHashP) import Unison.Util.Bytes qualified as Bytes import Unison.Util.Monoid (intercalateMap) +type BlockName = String + +type Layout = [(BlockName, Column)] + +data ParsingEnv = ParsingEnv + { -- | layout stack + layout :: !Layout, + -- | `Just b` if a block of type `b` is being opened + opening :: Maybe BlockName, + -- | are we inside a construct that uses layout? + inLayout :: Bool + } + deriving (Show) + +type P = P.ParsecT (Token Err) String (S.State ParsingEnv) + +data Err + = ReservedWordyId String + | InvalidSymbolyId String + | ReservedSymbolyId String + | InvalidShortHash String + | InvalidBytesLiteral String + | InvalidHexLiteral + | InvalidOctalLiteral + | Both Err Err + | MissingFractional String -- ex `1.` rather than `1.04` + | MissingExponent String -- ex `1e` rather than `1e3` + | UnknownLexeme + | TextLiteralMissingClosingQuote String + | InvalidEscapeCharacter Char + | LayoutError + | CloseWithoutMatchingOpen String String -- open, close + | UnexpectedDelimiter String + | UnexpectedTokens String -- Catch-all for all other lexer errors, representing some unexpected tokens. 
+ deriving stock (Eq, Ord, Show) -- richer algebra + -- Design principle: -- `[Lexeme]` should be sufficient information for parsing without -- further knowledge of spacing or indentation levels @@ -80,11 +117,20 @@ data Lexeme | Bytes Bytes.Bytes -- bytes literals | Hash ShortHash -- hash literals | Err Err - | Doc (Doc.UntitledSection (Doc.Tree [Token Lexeme])) + | Doc (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [Token Lexeme])) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? +-- Committed failure +err :: (P.TraversableStream s, P.MonadParsec (Token Err) s m) => Pos -> Err -> m x +err start t = do + stop <- posP + -- This consumes a character and therefore produces committed failure, + -- so `err s t <|> p2` won't try `p2` + _ <- void P.anySingle <|> P.eof + P.customFailure (Token t start stop) + token :: P Lexeme -> P [Token Lexeme] token = token' (\a start end -> [Token a start end]) @@ -230,7 +276,7 @@ lexer scope rem = (P.EndOfInput) -> "end of input" customErrs es = [Err <$> e | P.ErrorCustom e <- toList es] toPos (P.SourcePos _ line col) = Pos (P.unPos line) (P.unPos col) - env0 = ParsingEnv [] (Just scope) True [0] 0 + env0 = ParsingEnv [] (Just scope) True -- | hacky postprocessing pass to do some cleanup of stuff that's annoying to -- fix without adding more state to the lexer: @@ -306,14 +352,9 @@ doc2 = do env0 <- S.get -- Disable layout while parsing the doc block and reset the section number (docTok, closeTok) <- local - ( \env -> - env - { inLayout = False, - parentSections = 0 : (parentSections env0) - } - ) + (\env -> env {inLayout = False}) do - body <- Doc.untitledSection . Doc.sectionElem lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.doc identifierP lexemes' . P.lookAhead $ () <$ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP @@ -674,6 +715,27 @@ tok p = do token <- tokenP p pure [token] +-- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is +-- symboly (comprised of only symbols) or wordy (comprised of only alphanums). +-- +-- Examples: +-- +-- foo +-- .foo.++.doc +-- `.`.`..` (This is a two-segment identifier without a leading dot: "." then "..") +identifierP :: (Monad m) => P.ParsecT (Token Err) String m (HQ'.HashQualified Name) +identifierP = do + P.label "identifier (ex: abba1, snake_case, .foo.bar#xyz, .foo.++#xyz, or 🌻)" do + name <- PI.withParsecT (fmap nameSegmentParseErrToErr) Name.nameP + P.optional shortHashP <&> \case + Nothing -> HQ'.fromName name + Just shorthash -> HQ'.HashQualified name shorthash + where + nameSegmentParseErrToErr :: NameSegment.ParseErr -> Err + nameSegmentParseErrToErr = \case + NameSegment.ReservedOperator s -> ReservedSymbolyId (Text.unpack s) + NameSegment.ReservedWord s -> ReservedWordyId (Text.unpack s) + -- An identifier is a non-empty dot-delimited list of segments, with an optional leading dot, where each segment is -- symboly (comprised of only symbols) or wordy (comprised of only alphanums). -- @@ -691,6 +753,14 @@ identifierLexeme name = then SymbolyId name else WordyId name +wordyIdSegP :: P.ParsecT (Token Err) String m NameSegment +wordyIdSegP = + PI.withParsecT (fmap (ReservedWordyId . Text.unpack)) NameSegment.wordyP + +shortHashP :: P.ParsecT (Token Err) String m ShortHash +shortHashP = + PI.withParsecT (fmap (InvalidShortHash . 
Text.unpack)) ShortHash.shortHashP
+
 blockDelimiter :: [String] -> P String -> P [Token Lexeme]
 blockDelimiter open closeP = do
   Token close pos1 pos2 <- tokenP closeP
@@ -739,6 +809,11 @@ top :: Layout -> Column
 top [] = 1
 top ((_, h) : _) = h
 
+-- todo: make Layout a NonEmpty
+topBlockName :: Layout -> Maybe BlockName
+topBlockName [] = Nothing
+topBlockName ((name, _) : _) = Just name
+
 topLeftCorner :: Pos
 topLeftCorner = Pos 1 1
 
@@ -855,6 +930,10 @@ showEscapeChar :: Char -> Maybe Char
 showEscapeChar c =
   Map.lookup c (Map.fromList [(x, y) | (y, x) <- escapeChars])
 
+typeModifiersAlt :: (Alternative f) => (Text -> f a) -> f a
+typeModifiersAlt f =
+  asum $ map f (toList typeModifiers)
+
 debugFilePreParse :: FilePath -> IO ()
 debugFilePreParse file = putStrLn . debugPreParse . preParse . lexer file . Text.unpack =<< readUtf8 file
 
@@ -877,6 +956,18 @@ debugPreParse ts = show $ payload <$> ts
 debugPreParse' :: String -> String
 debugPreParse' = debugPreParse . preParse . lexer "debugPreParse"
 
+instance EP.ShowErrorComponent (Token Err) where
+  showErrorComponent (Token err _ _) = go err
+    where
+      go = \case
+        UnexpectedTokens msg -> msg
+        CloseWithoutMatchingOpen open close -> "I found a closing " <> close <> " but no matching " <> open <> "."
+        Both e1 e2 -> go e1 <> "\n" <> go e2
+        LayoutError -> "Indentation error"
+        TextLiteralMissingClosingQuote s -> "This text literal is missing a closing quote: " <> excerpt s
+        e -> show e
+      excerpt s = if length s < 15 then s else take 15 s <> "..."
+
 instance P.VisualStream [Token Lexeme] where
   showTokens _ xs =
     join . Nel.toList . S.evalState (traverse go xs) . end $ Nel.head xs
diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs
index 098caab1b6..fac55142de 100644
--- a/unison-syntax/src/Unison/Syntax/Parser.hs
+++ b/unison-syntax/src/Unison/Syntax/Parser.hs
@@ -401,7 +401,7 @@ string = queryToken getString
     getString (L.Textual s) = Just (Text.pack s)
     getString _ = Nothing
 
-doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree [L.Token L.Lexeme])))
+doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [L.Token L.Lexeme])))
 doc =
   queryToken \case
     L.Doc d -> pure d
     _ -> Nothing
diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs
index 99122bd5ff..4009c30dec 100644
--- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs
+++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs
@@ -1,5 +1,17 @@
+-- | The parser for Unison’s @Doc@ syntax.
+--
+-- This is completely independent of the Unison language, and requires a few parsers to be passed in to provide a
+-- parser for @Doc@ applied to any host language:
+--
+-- - an identifier parser
+-- - a code parser (that accepts a termination parser)
+-- - a termination parser (only used for lookahead), for this parser to know when to give up
+--
+-- Each of those parsers is expected to satisfy @(`Ord` e, `P.MonadParsec` e `String` m)@. 
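+--
+-- A rough sketch of the wiring (hedged: @hostDoc@ is an illustrative name, not part of this patch), mirroring the
+-- call made from "Unison.Syntax.Lexer.Unison" above, where Unison terminates @Doc@ blocks with @}}@:
+--
+-- > -- a Doc parser for a host language whose identifier and code parsers are
+-- > -- identifierP and lexemes', and whose Doc blocks close with "}}"
+-- > hostDoc = doc identifierP lexemes' (P.lookAhead . void $ lit "}}")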
module Unison.Syntax.Parser.Doc ( Tree, + initialState, + doc, untitledSection, sectionElem, leaf, @@ -55,10 +67,7 @@ import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude hiding (join) import Unison.Syntax.Lexer - ( P, - ParsingEnv (..), - column, - identifierP, + ( column, line, lit, local, @@ -73,28 +82,58 @@ import Unison.Syntax.Lexer import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) import Unison.Syntax.Parser.Doc.Data -type Tree code = Cofree (Top code) Ann +type Tree ident code = Cofree (Top ident code) Ann + +data ParsingEnv = ParsingEnv + { -- | Use a stack to remember the parent section and allow docSections within docSections. + -- - 1 means we are inside a # Heading 1 + parentSections :: [Int], + -- | 4 means we are inside a list starting at the fourth column + parentListColumn :: Int + } + deriving (Show) + +initialState :: ParsingEnv +initialState = ParsingEnv [0] 0 + +doc :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (UntitledSection (Tree ident code)) +doc ident code = flip S.evalStateT initialState . untitledSection . sectionElem ident code -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). -untitledSection :: P a -> P (UntitledSection a) +untitledSection :: (P.MonadParsec e String m) => m a -> m (UntitledSection a) untitledSection a = UntitledSection <$> P.many (a <* CP.space) -wordyKw :: String -> P String +wordyKw :: (P.MonadParsec e String m) => String -> m String wordyKw kw = separated wordySep (lit kw) -sectionElem :: (Annotated code) => (P () -> P code) -> P () -> P (Tree code) -sectionElem code docClose = +sectionElem :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Tree ident code) +sectionElem ident code docClose = fmap wrap' $ - section code docClose - <|> P.label "block eval (syntax: a fenced code block)" (eval code <|> exampleBlock code <|> codeBlock) - <|> list code docClose - <|> paragraph code docClose - -paragraph :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -paragraph code = fmap Paragraph . spaced . leafy code - -word :: P end -> P (Leaf code void) + section ident code docClose + <|> lift (P.label "block eval (syntax: a fenced code block)" (eval code <|> exampleBlock code <|> codeBlock)) + <|> list ident code docClose + <|> lift (paragraph ident code docClose) + +paragraph :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Top ident code (Tree ident code)) +paragraph ident code = fmap Paragraph . spaced . leafy ident code + +word :: (Ord e, P.MonadParsec e String m) => m end -> m (Leaf ident code void) word closing = fmap Word . tokenP . P.try $ do let end = P.lookAhead $ void (P.satisfy isSpace) <|> void closing word <- P.manyTill (P.satisfy (\ch -> not (isSpace ch))) end @@ -103,43 +142,56 @@ word closing = fmap Word . tokenP . 
P.try $ do where reserved word = List.isPrefixOf "}}" word || all (== '#') word -leaf :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -leaf code closing = - do - link - <|> namedLink code closing - <|> example code - <|> transclude code - <|> bold code closing - <|> italic code closing - <|> strikethrough code closing +leaf :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +leaf ident code closing = + link ident + <|> namedLink ident code closing + <|> example code + <|> transclude code + <|> bold ident code closing + <|> italic ident code closing + <|> strikethrough ident code closing <|> verbatim - <|> source code - <|> foldedSource code + <|> source ident code + <|> foldedSource ident code <|> evalInline code - <|> signatures - <|> signatureInline + <|> signatures ident + <|> signatureInline ident <|> word closing -leafy :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -leafy code closing = do - p <- leaf code closing - after <- P.optional . P.try $ leafy code closing +leafy :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +leafy ident code closing = do + p <- leaf ident code closing + after <- P.optional . P.try $ leafy ident code closing case after of Nothing -> pure p Just after -> group . pure $ p :| pure after -comma :: P String +comma :: (P.MonadParsec e String m) => m String comma = lit "," <* CP.space -source :: (P () -> P code) -> P (Leaf code a) -source = fmap Source . (lit "@source" *>) . sourceElements +source :: (Ord e, P.MonadParsec e String m) => m ident -> (m () -> m code) -> m (Leaf ident code a) +source ident = fmap Source . (lit "@source" *>) . sourceElements ident -foldedSource :: (P () -> P code) -> P (Leaf code a) -foldedSource = fmap FoldedSource . (lit "@foldedSource" *>) . sourceElements +foldedSource :: (Ord e, P.MonadParsec e String m) => m ident -> (m () -> m code) -> m (Leaf ident code a) +foldedSource ident = fmap FoldedSource . (lit "@foldedSource" *>) . sourceElements ident -sourceElements :: (P () -> P code) -> P (NonEmpty (SourceElement (Leaf code Void))) -sourceElements code = do +sourceElements :: + (Ord e, P.MonadParsec e String m) => + m ident -> + (m () -> m code) -> + m (NonEmpty (SourceElement ident (Leaf ident code Void))) +sourceElements ident code = do _ <- (lit " {" <|> lit "{") *> CP.space s <- sepBy1' srcElem comma _ <- lit "}" @@ -147,49 +199,48 @@ sourceElements code = do where srcElem = SourceElement - <$> embedLink + <$> embedLink ident <*> ( fmap (fromMaybe []) . 
P.optional $ (lit "@") *> (CP.space *> annotations) ) where - annotation = fmap Left (tokenP identifierP) <|> fmap Right (transclude code) <* CP.space - annotations = - P.some (EmbedAnnotation <$> annotation) + annotation = fmap Left (tokenP ident) <|> fmap Right (transclude code) <* CP.space + annotations = P.some (EmbedAnnotation <$> annotation) -signatures :: P (Leaf code a) -signatures = fmap Signature $ do +signatures :: (Ord e, P.MonadParsec e String m) => m ident -> m (Leaf ident code a) +signatures ident = fmap Signature $ do _ <- (lit "@signatures" <|> lit "@signature") *> (lit " {" <|> lit "{") *> CP.space - s <- sepBy1' embedSignatureLink comma + s <- sepBy1' (embedSignatureLink ident) comma _ <- lit "}" pure s -signatureInline :: P (Leaf code a) -signatureInline = fmap SignatureInline $ do +signatureInline :: (Ord e, P.MonadParsec e String m) => m ident -> m (Leaf ident code a) +signatureInline ident = fmap SignatureInline $ do _ <- lit "@inlineSignature" *> (lit " {" <|> lit "{") *> CP.space - s <- embedSignatureLink + s <- embedSignatureLink ident _ <- lit "}" pure s -evalInline :: (P () -> P a1) -> P (Leaf a1 a2) +evalInline :: (P.MonadParsec e String m) => (m () -> m code) -> m (Leaf ident code a) evalInline code = fmap EvalInline $ do _ <- lit "@eval" *> (lit " {" <|> lit "{") *> CP.space let inlineEvalClose = void $ lit "}" s <- code inlineEvalClose pure s -embedTypeLink :: P EmbedLink -embedTypeLink = +embedTypeLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) +embedTypeLink ident = EmbedTypeLink <$> do _ <- typeOrAbilityAlt (wordyKw . Text.unpack) <* CP.space - tokenP identifierP <* CP.space + tokenP ident <* CP.space -embedTermLink :: P EmbedLink -embedTermLink = EmbedTermLink <$> tokenP identifierP <* CP.space +embedTermLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) +embedTermLink ident = EmbedTermLink <$> tokenP ident <* CP.space -embedSignatureLink :: P EmbedSignatureLink -embedSignatureLink = EmbedSignatureLink <$> tokenP identifierP <* CP.space +embedSignatureLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedSignatureLink ident) +embedSignatureLink ident = EmbedSignatureLink <$> tokenP ident <* CP.space -verbatim :: P (Leaf code a) +verbatim :: (Ord e, P.MonadParsec e String m) => m (Leaf ident code a) verbatim = P.label "code (examples: ''**unformatted**'', `words` or '''_words_''')" $ do Token originalText start stop <- tokenP do @@ -199,21 +250,17 @@ verbatim = quotes <- tick <|> (lit "''" <+> P.takeWhileP Nothing (== '\'')) P.someTill P.anySingle (lit quotes) let isMultiLine = line start /= line stop - if isMultiLine - then do - let trimmed = (trimAroundDelimiters originalText) - let txt = trimIndentFromVerbatimBlock (column start - 1) trimmed - -- If it's a multi-line verbatim block we trim any whitespace representing - -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' - pure . Verbatim $ - Word $ - Token txt start stop - else - pure . Code $ - Word $ - Token originalText start stop - -example :: (P () -> P code) -> P (Leaf code void) + pure + if isMultiLine + then + let trimmed = (trimAroundDelimiters originalText) + txt = trimIndentFromVerbatimBlock (column start - 1) trimmed + in -- If it's a multi-line verbatim block we trim any whitespace representing + -- indentation from the pretty-printer. See 'trimIndentFromVerbatimBlock' + Verbatim . Word $ Token txt start stop + else Code . 
Word $ Token originalText start stop + +example :: (P.MonadParsec e String m) => (m () -> m code) -> m (Leaf ident code void) example code = P.label "inline code (examples: ``List.map f xs``, ``[1] :+ 2``)" $ fmap Example $ do @@ -223,20 +270,20 @@ example code = let end = void . lit $ replicate (n + 1) '`' CP.space *> code end -link :: P (Leaf a b) -link = P.label "link (examples: {type List}, {Nat.+})" $ Link <$> P.try (lit "{" *> embedLink <* lit "}") +link :: (Ord e, P.MonadParsec e String m) => m ident -> m (Leaf ident code a) +link ident = P.label "link (examples: {type List}, {Nat.+})" $ Link <$> P.try (lit "{" *> embedLink ident <* lit "}") -transclude :: (P () -> P code) -> P (Leaf code x) +transclude :: (P.MonadParsec e String m) => (m () -> m code) -> m (Leaf ident code a) transclude code = fmap Transclude . P.label "transclusion (examples: {{ doc2 }}, {{ sepBy s [doc1, doc2] }})" $ lit "{{" *> code (void $ lit "}}") -nonNewlineSpaces :: P String +nonNewlineSpaces :: (P.MonadParsec e String m) => m String nonNewlineSpaces = P.takeWhileP Nothing nonNewlineSpace where nonNewlineSpace ch = isSpace ch && ch /= '\n' && ch /= '\r' -eval :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +eval :: (P.MonadParsec e String m, Annotated code) => (m () -> m code) -> m (Top ident code (Tree ident code)) eval code = Eval <$> do -- commit after seeing that ``` is on its own line @@ -246,7 +293,7 @@ eval code = fence <$ guard b CP.space *> code (void $ lit fence) -exampleBlock :: (Annotated code) => (P () -> P code) -> P (Top code (Tree code)) +exampleBlock :: (P.MonadParsec e String m, Annotated code) => (m () -> m code) -> m (Top ident code (Tree ident code)) exampleBlock code = ExampleBlock <$> do @@ -254,7 +301,7 @@ exampleBlock code = fence <- lit "```" <+> P.takeWhileP Nothing (== '`') code . void $ lit fence -codeBlock :: P (Top code (Tree code)) +codeBlock :: (Ord e, P.MonadParsec e String m) => m (Top ident code (Tree ident code)) codeBlock = do column <- (\x -> x - 1) . toInteger . P.unPos <$> LP.indentLevel let tabWidth = toInteger . P.unPos $ P.defaultTabWidth @@ -280,8 +327,14 @@ codeBlock = do skip _ s = s in List.intercalate "\n" $ skip column <$> lines s -emphasis :: (Annotated code) => Char -> (P () -> P code) -> P () -> P (Tree code) -emphasis delimiter code closing = do +emphasis :: + (Ord e, P.MonadParsec e String m, Annotated code) => + Char -> + m ident -> + (m () -> m code) -> + m () -> + m (Tree ident code) +emphasis delimiter ident code closing = do let start = some (P.satisfy (== delimiter)) end <- P.try $ do end <- start @@ -289,38 +342,57 @@ emphasis delimiter code closing = do pure end wrap' . Paragraph <$> someTill' - (leafy code (closing <|> (void $ lit end)) <* void whitespaceWithoutParagraphBreak) + (leafy ident code (closing <|> (void $ lit end)) <* void whitespaceWithoutParagraphBreak) (lit end) where - -- Allows whitespace or a newline, but not more than two newlines in a row. - whitespaceWithoutParagraphBreak :: P () + -- Allows whitespace including up to one newline whitespaceWithoutParagraphBreak = void do void nonNewlineSpaces optional newline >>= \case Just _ -> void nonNewlineSpaces Nothing -> pure () -bold :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -bold code = fmap Bold . emphasis '*' code - -italic :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -italic code = fmap Italic . 
emphasis '_' code - -strikethrough :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -strikethrough code = fmap Strikethrough . emphasis '~' code - -namedLink :: (Annotated code) => (P () -> P code) -> P () -> P (Leaf code (Tree code)) -namedLink code docClose = +bold :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +bold ident code = fmap Bold . emphasis '*' ident code + +italic :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +italic ident code = fmap Italic . emphasis '_' ident code + +strikethrough :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +strikethrough ident code = fmap Strikethrough . emphasis '~' ident code + +namedLink :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + m (Leaf ident code (Tree ident code)) +namedLink ident code docClose = P.label "hyperlink (example: [link name](https://destination.com))" do _ <- lit "[" - p <- spaced . leafy code . void $ char ']' + p <- spaced . leafy ident code . void $ char ']' _ <- lit "]" _ <- lit "(" - target <- group $ fmap pure link <|> some' (transclude code <|> word (docClose <|> void (char ')'))) + target <- group $ fmap pure (link ident) <|> some' (transclude code <|> word (docClose <|> void (char ')'))) _ <- lit ")" pure $ NamedLink (wrap' $ Paragraph p) target -sp :: P String +sp :: (P.MonadParsec e String m) => m String sp = P.try $ do spaces <- P.takeWhile1P (Just "space") isSpace close <- P.optional (P.lookAhead (lit "}}")) @@ -331,17 +403,22 @@ sp = P.try $ do where ok s = length [() | '\n' <- s] < 2 -spaced :: P a -> P (NonEmpty a) +spaced :: (P.MonadParsec e String m) => m a -> m (NonEmpty a) spaced p = some' (p <* P.optional sp) -- | Not an actual node, but this pattern is referenced in multiple places -list :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -list code docClose = bulletedList code docClose <|> numberedList code docClose - -listSep :: P () +list :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Top ident code (Tree ident code)) +list ident code docClose = bulletedList ident code docClose <|> numberedList ident code docClose + +listSep :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m () listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) -bulletedStart :: P (Int, [a]) +bulletedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, [a]) bulletedStart = P.try $ do r <- listItemStart $ [] <$ P.satisfy bulletChar P.lookAhead (P.satisfy isSpace) @@ -349,43 +426,59 @@ bulletedStart = P.try $ do where bulletChar ch = ch == '*' || ch == '-' || ch == '+' -listItemStart :: P a -> P (Int, a) -listItemStart gutter = P.try $ do +listItemStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m a -> m (Int, a) +listItemStart gutter = P.try do nonNewlineSpaces col <- column <$> posP parentCol <- S.gets parentListColumn guard (col > parentCol) (col,) <$> gutter -numberedStart :: P (Int, Token Word64) +numberedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, Token Word64) numberedStart = listItemStart $ P.try (tokenP $ 
LP.decimal <* lit ".") -- | FIXME: This should take a @`P` a@ -numberedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -numberedList code docClose = NumberedList <$> sepBy1' numberedItem listSep +numberedList :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Top ident code (Tree ident code)) +numberedList ident code docClose = NumberedList <$> sepBy1' numberedItem listSep where numberedItem = P.label "numbered list (examples: 1. item1, 8. start numbering at '8')" do (col, s) <- numberedStart - (s,) <$> column' code docClose col + (s,) <$> column' ident code docClose col -- | FIXME: This should take a @`P` a@ -bulletedList :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code)) -bulletedList code docClose = BulletedList <$> sepBy1' bullet listSep +bulletedList :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + S.StateT ParsingEnv m (Top ident code (Tree ident code)) +bulletedList ident code docClose = BulletedList <$> sepBy1' bullet listSep where bullet = P.label "bullet (examples: * item1, - item2)" do (col, _) <- bulletedStart - column' code docClose col - -column' :: (Annotated code) => (P () -> P code) -> P () -> Int -> P (Column (Tree code)) -column' code docClose col = + column' ident code docClose col + +column' :: + (Ord e, P.MonadParsec e String m, Annotated code) => + m ident -> + (m () -> m code) -> + m () -> + Int -> + S.StateT ParsingEnv m (Column (Tree ident code)) +column' ident code docClose col = Column . wrap' <$> (nonNewlineSpaces *> listItemParagraph) - <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list code docClose)) + <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list ident code docClose)) where listItemParagraph = Paragraph <$> do col <- column <$> posP - some' (leafy code docClose <* sep col) + some' (lift (leafy ident code docClose) <* sep col) where -- Trickiness here to support hard line breaks inside of -- a bulleted list, so for instance this parses as expected: @@ -406,7 +499,7 @@ column' code docClose col = (P.notFollowedBy $ void numberedStart <|> void bulletedStart) pure () -newline :: P String +newline :: (P.MonadParsec e String m) => m String newline = P.label "newline" $ lit "\n" <|> lit "\r\n" -- | @@ -421,32 +514,37 @@ newline = P.label "newline" $ lit "\n" <|> lit "\r\n" -- > A paragraph under this subsection. 
-- >
-- > # A section title (not a subsection)
-section :: (Annotated code) => (P () -> P code) -> P () -> P (Top code (Tree code))
-section code docClose = do
+section ::
+  (Ord e, P.MonadParsec e String m, Annotated code) =>
+  m ident ->
+  (m () -> m code) ->
+  m () ->
+  S.StateT ParsingEnv m (Top ident code (Tree ident code))
+section ident code docClose = do
   ns <- S.gets parentSections
   hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp
-  title <- paragraph code docClose <* CP.space
+  title <- lift $ paragraph ident code docClose <* CP.space
   let m = length hashes + head ns
   body <-
     local (\env -> env {parentSections = (m : (tail ns))}) $
-      P.many (sectionElem code docClose <* CP.space)
+      P.many (sectionElem ident code docClose <* CP.space)
   pure $ Section (wrap' title) body
 
 -- | Not an actual node, but this pattern is referenced in multiple places
-embedLink :: P EmbedLink
-embedLink = embedTypeLink <|> embedTermLink
+embedLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident)
+embedLink ident = embedTypeLink ident <|> embedTermLink ident
 
 -- | FIXME: This should just take a @`P` code@ and @`P` a@.
-group :: P (NonEmpty (Leaf code a)) -> P (Leaf code a)
+group :: (P.MonadParsec e s m) => m (NonEmpty (Leaf ident code a)) -> m (Leaf ident code a)
 group = fmap Group . join
 
 -- | FIXME: This should just take a @`P` a@
-join :: P (NonEmpty a) -> P (Join a)
+join :: (P.MonadParsec e s m) => m (NonEmpty a) -> m (Join a)
 join = fmap Join
 
 -- * utility functions
 
-wrap' :: (Annotated code) => Top code (Tree code) -> Tree code
+wrap' :: (Annotated code) => Top ident code (Tree ident code) -> Tree ident code
 wrap' doc = ann doc :< doc
 
 -- | If it's a multi-line verbatim block we trim any whitespace representing
diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs
index 5167b2bcf6..56a14939b6 100644
--- a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs
+++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs
@@ -1,7 +1,11 @@
 {-# LANGUAGE TemplateHaskell #-}
+{-# OPTIONS_GHC -Wno-orphans #-}
 
 -- | Haskell parallel to @unison/base.Doc@.
 --
+-- These types have two significant parameters, @ident@ and @code@, which are expected to be instantiated with some
+-- representation of identifiers and of source code in the host language.
+--
 -- This is much more restricted than @unison/base.Doc@, but it covers everything we can parse from Haskell. The
 -- mismatch with Unison is a problem, as someone can create a Unison Doc with explicit constructors or function calls,
 -- have it rendered to a scratch file, and then we can’t parse it. 
Changing the types here to match Unison wouldn’t @@ -13,8 +17,6 @@ import Data.Eq.Deriving (deriveEq1, deriveEq2) import Data.List.NonEmpty (NonEmpty) import Data.Ord.Deriving (deriveOrd1, deriveOrd2) import Text.Show.Deriving (deriveShow1, deriveShow2) -import Unison.HashQualifiedPrime qualified as HQ' -import Unison.Name (Name) import Unison.Parser.Ann (Annotated (..)) import Unison.Prelude import Unison.Syntax.Lexer.Token (Token (..)) @@ -22,7 +24,7 @@ import Unison.Syntax.Lexer.Token (Token (..)) newtype UntitledSection a = UntitledSection [a] deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -data Top code a +data Top ident code a = -- | The first argument is always a `Paragraph` Section a [a] | Eval code @@ -30,7 +32,7 @@ data Top code a | CodeBlock (Token String) (Token String) | BulletedList (NonEmpty (Column a)) | NumberedList (NonEmpty (Token Word64, Column a)) - | Paragraph (NonEmpty (Leaf code a)) + | Paragraph (NonEmpty (Leaf ident code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) data Column a @@ -38,11 +40,11 @@ data Column a Column a (Maybe a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -data Leaf code a - = Link EmbedLink +data Leaf ident code a + = Link (EmbedLink ident) | -- | first is a Paragraph, second is always a Group (which contains either a single Term/Type link or list of -- `Transclude`s & `Word`s) - NamedLink a (Leaf code Void) + NamedLink a (Leaf ident code Void) | Example code | Transclude code | -- | Always a Paragraph @@ -52,21 +54,21 @@ data Leaf code a | -- | Always a Paragraph Strikethrough a | -- | Always a Word - Verbatim (Leaf Void Void) + Verbatim (Leaf ident Void Void) | -- | Always a Word - Code (Leaf Void Void) + Code (Leaf ident Void Void) | -- | Always a Transclude - Source (NonEmpty (SourceElement (Leaf code Void))) + Source (NonEmpty (SourceElement ident (Leaf ident code Void))) | -- | Always a Transclude - FoldedSource (NonEmpty (SourceElement (Leaf code Void))) + FoldedSource (NonEmpty (SourceElement ident (Leaf ident code Void))) | EvalInline code - | Signature (NonEmpty EmbedSignatureLink) - | SignatureInline EmbedSignatureLink + | Signature (NonEmpty (EmbedSignatureLink ident)) + | SignatureInline (EmbedSignatureLink ident) | Word (Token String) - | Group (Join (Leaf code a)) + | Group (Join (Leaf ident code a)) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -instance Bifunctor Leaf where +instance Bifunctor (Leaf ident) where bimap f g = \case Link x -> Link x NamedLink a leaf -> NamedLink (g a) $ first f leaf @@ -85,25 +87,25 @@ instance Bifunctor Leaf where Word x -> Word x Group join -> Group $ bimap f g <$> join -data EmbedLink - = EmbedTypeLink (Token (HQ'.HashQualified Name)) - | EmbedTermLink (Token (HQ'.HashQualified Name)) +data EmbedLink ident + = EmbedTypeLink (Token ident) + | EmbedTermLink (Token ident) deriving (Eq, Ord, Show) -data SourceElement a = SourceElement EmbedLink [EmbedAnnotation a] +data SourceElement ident a = SourceElement (EmbedLink ident) [EmbedAnnotation ident a] deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -newtype EmbedSignatureLink = EmbedSignatureLink (Token (HQ'.HashQualified Name)) +newtype EmbedSignatureLink ident = EmbedSignatureLink (Token ident) deriving (Eq, Ord, Show) newtype Join a = Join (NonEmpty a) deriving (Eq, Ord, Show, Foldable, Functor, Traversable) -newtype EmbedAnnotation a - = EmbedAnnotation (Either (Token (HQ'.HashQualified Name)) a) +newtype EmbedAnnotation ident a + = EmbedAnnotation (Either (Token ident) a) deriving (Eq, 
Ord, Show, Foldable, Functor, Traversable) -instance (Annotated code, Annotated a) => Annotated (Top code a) where +instance (Annotated code, Annotated a) => Annotated (Top ident code a) where ann = \case Section title body -> ann title <> ann body Eval code -> ann code @@ -116,7 +118,7 @@ instance (Annotated code, Annotated a) => Annotated (Top code a) where instance (Annotated a) => Annotated (Column a) where ann (Column para list) = ann para <> ann list -instance (Annotated code, Annotated a) => Annotated (Leaf code a) where +instance (Annotated code, Annotated a) => Annotated (Leaf ident code a) where ann = \case Link link -> ann link NamedLink label target -> ann label <> ann target @@ -135,31 +137,45 @@ instance (Annotated code, Annotated a) => Annotated (Leaf code a) where Word text -> ann text Group (Join leaves) -> ann leaves -instance Annotated EmbedLink where +instance Annotated (EmbedLink ident) where ann = \case EmbedTypeLink name -> ann name EmbedTermLink name -> ann name -instance (Annotated code) => Annotated (SourceElement code) where +instance (Annotated code) => Annotated (SourceElement ident code) where ann (SourceElement link target) = ann link <> ann target -instance Annotated EmbedSignatureLink where +instance Annotated (EmbedSignatureLink ident) where ann (EmbedSignatureLink name) = ann name -instance (Annotated code) => Annotated (EmbedAnnotation code) where +instance (Annotated code) => Annotated (EmbedAnnotation ident code) where ann (EmbedAnnotation a) = either ann ann a $(deriveEq1 ''Column) $(deriveOrd1 ''Column) $(deriveShow1 ''Column) +$(deriveEq1 ''Token) +$(deriveOrd1 ''Token) +$(deriveShow1 ''Token) + $(deriveEq1 ''EmbedAnnotation) $(deriveOrd1 ''EmbedAnnotation) $(deriveShow1 ''EmbedAnnotation) +$(deriveEq2 ''EmbedAnnotation) +$(deriveOrd2 ''EmbedAnnotation) +$(deriveShow2 ''EmbedAnnotation) + +$(deriveEq1 ''EmbedLink) +$(deriveOrd1 ''EmbedLink) +$(deriveShow1 ''EmbedLink) $(deriveEq1 ''SourceElement) $(deriveOrd1 ''SourceElement) $(deriveShow1 ''SourceElement) +$(deriveEq2 ''SourceElement) +$(deriveOrd2 ''SourceElement) +$(deriveShow2 ''SourceElement) $(deriveEq1 ''Join) $(deriveOrd1 ''Join) From 9a941a389079373465ecdd6b6baad8936f3561b1 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 23:53:49 -0600 Subject: [PATCH 19/22] Caught a hardcoded `}}` in the Doc parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Doc parser shouldn’t know how Unison terminates Doc blocks. --- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index 4009c30dec..cecf2ca6a2 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -131,7 +131,7 @@ paragraph :: (m () -> m code) -> m () -> m (Top ident code (Tree ident code)) -paragraph ident code = fmap Paragraph . spaced . leafy ident code +paragraph ident code docClose = fmap Paragraph . spaced docClose $ leafy ident code docClose word :: (Ord e, P.MonadParsec e String m) => m end -> m (Leaf ident code void) word closing = fmap Word . tokenP . P.try $ do @@ -385,17 +385,17 @@ namedLink :: namedLink ident code docClose = P.label "hyperlink (example: [link name](https://destination.com))" do _ <- lit "[" - p <- spaced . leafy ident code . void $ char ']' + p <- spaced docClose . leafy ident code . 
void $ char ']' _ <- lit "]" _ <- lit "(" target <- group $ fmap pure (link ident) <|> some' (transclude code <|> word (docClose <|> void (char ')'))) _ <- lit ")" pure $ NamedLink (wrap' $ Paragraph p) target -sp :: (P.MonadParsec e String m) => m String -sp = P.try $ do +sp :: (P.MonadParsec e String m) => m () -> m String +sp docClose = P.try $ do spaces <- P.takeWhile1P (Just "space") isSpace - close <- P.optional (P.lookAhead (lit "}}")) + close <- P.optional (P.lookAhead docClose) case close of Nothing -> guard $ ok spaces Just _ -> pure () @@ -403,8 +403,8 @@ sp = P.try $ do where ok s = length [() | '\n' <- s] < 2 -spaced :: (P.MonadParsec e String m) => m a -> m (NonEmpty a) -spaced p = some' (p <* P.optional sp) +spaced :: (P.MonadParsec e String m) => m () -> m a -> m (NonEmpty a) +spaced docClose p = some' $ p <* P.optional (sp docClose) -- | Not an actual node, but this pattern is referenced in multiple places list :: @@ -522,7 +522,7 @@ section :: S.StateT ParsingEnv m (Top ident code (Tree ident code)) section ident code docClose = do ns <- S.gets parentSections - hashes <- P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp + hashes <- lift $ P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp docClose title <- lift $ paragraph ident code docClose <* CP.space let m = length hashes + head ns body <- From beecaa9be715f2090b01577e12b2e541a60404b6 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Fri, 26 Jul 2024 23:54:54 -0600 Subject: [PATCH 20/22] Make Doc parser ignorant of type/term distinctions This was the last thing tying Doc to Unison. --- parser-typechecker/src/Unison/PrintError.hs | 8 ++++ .../src/Unison/Syntax/TermParser.hs | 43 +++++++++++++------ .../src/Unison/Syntax/Lexer/Unison.hs | 12 ++++-- unison-syntax/src/Unison/Syntax/Parser.hs | 6 ++- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 36 +++------------- .../src/Unison/Syntax/Parser/Doc/Data.hs | 10 ++--- 6 files changed, 59 insertions(+), 56 deletions(-) diff --git a/parser-typechecker/src/Unison/PrintError.hs b/parser-typechecker/src/Unison/PrintError.hs index dd796c0159..691d7cd3ef 100644 --- a/parser-typechecker/src/Unison/PrintError.hs +++ b/parser-typechecker/src/Unison/PrintError.hs @@ -1861,6 +1861,14 @@ renderParseErrors s = \case <> structuralVsUniqueDocsLink ] in (msg, rangeForToken <$> [void keyword, void name]) + go (Parser.TypeNotAllowed tok) = + let msg = + Pr.lines + [ Pr.wrap "I expected to see a term here, but instead it’s a type:", + "", + tokenAsErrorSite s $ HQ.toText <$> tok + ] + in (msg, [rangeForToken tok]) unknownConstructor :: String -> L.Token (HashQualified Name) -> Pretty ColorText diff --git a/parser-typechecker/src/Unison/Syntax/TermParser.hs b/parser-typechecker/src/Unison/Syntax/TermParser.hs index 8d0195410a..642ed0e339 100644 --- a/parser-typechecker/src/Unison/Syntax/TermParser.hs +++ b/parser-typechecker/src/Unison/Syntax/TermParser.hs @@ -27,6 +27,7 @@ import Data.Text qualified as Text import Data.Tuple.Extra qualified as TupleE import Data.Void (absurd, vacuous) import Text.Megaparsec qualified as P +import U.Codebase.Reference (ReferenceType (..)) import U.Core.ABT qualified as ABT import Unison.ABT qualified as ABT import Unison.Builtin.Decls qualified as DD @@ -530,7 +531,7 @@ doc2Block = do docUntitledSection ann (Doc.UntitledSection tops) = Term.app ann (f ann "UntitledSection") $ Term.list (gann tops) tops - docTop :: Doc.Top (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docTop :: 
Doc.Top (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docTop d = case d of Doc.Section title body -> pure $ Term.apps' (f d "Section") [title, Term.list (gann body) body] Doc.Eval code -> @@ -558,7 +559,7 @@ doc2Block = do docColumn d@(Doc.Column para sublist) = Term.app (gann d) (f d "Column") . Term.list (gann d) $ para : toList sublist - docLeaf :: Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m + docLeaf :: Doc.Leaf (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] (Term v Ann) -> TermP v m docLeaf d = case d of Doc.Link link -> Term.app (gann d) (f d "Link") <$> docEmbedLink link Doc.NamedLink para target -> Term.apps' (f d "NamedLink") . (para :) . pure <$> docLeaf (vacuous target) @@ -590,35 +591,49 @@ doc2Block = do Term.app (gann d) (f d "Group") . Term.app (gann d) (f d "Join") . Term.list (ann leaves) . toList <$> traverse docLeaf leaves - docEmbedLink :: Doc.EmbedLink (HQ'.HashQualified Name) -> TermP v m - docEmbedLink d = case d of - Doc.EmbedTypeLink ident -> + docEmbedLink :: Doc.EmbedLink (ReferenceType, HQ'.HashQualified Name) -> TermP v m + docEmbedLink d@(Doc.EmbedLink (L.Token (level, ident) start end)) = case level of + RtType -> Term.app (gann d) (f d "EmbedTypeLink") . Term.typeLink (ann d) . L.payload - <$> findUniqueType (HQ'.toHQ <$> ident) - Doc.EmbedTermLink ident -> - Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + <$> findUniqueType (L.Token (HQ'.toHQ ident) start end) + RtTerm -> + Term.app (gann d) (f d "EmbedTermLink") . addDelay <$> resolveHashQualified (L.Token (HQ'.toHQ ident) start end) docSourceElement :: - Doc.SourceElement (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + Doc.SourceElement + (ReferenceType, HQ'.HashQualified Name) + (Doc.Leaf (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> TermP v m docSourceElement d@(Doc.SourceElement link anns) = do link' <- docEmbedLink link anns' <- traverse docEmbedAnnotation anns pure $ Term.apps' (f d "SourceElement") [link', Term.list (ann anns) anns'] - docEmbedSignatureLink :: Doc.EmbedSignatureLink (HQ'.HashQualified Name) -> TermP v m - docEmbedSignatureLink d@(Doc.EmbedSignatureLink ident) = - Term.app (gann d) (f d "EmbedSignatureLink") . addDelay <$> resolveHashQualified (HQ'.toHQ <$> ident) + docEmbedSignatureLink :: Doc.EmbedSignatureLink (ReferenceType, HQ'.HashQualified Name) -> TermP v m + docEmbedSignatureLink d@(Doc.EmbedSignatureLink (L.Token (level, ident) start end)) = case level of + RtType -> P.customFailure . TypeNotAllowed $ L.Token (HQ'.toHQ ident) start end + RtTerm -> + Term.app (gann d) (f d "EmbedSignatureLink") . addDelay + <$> resolveHashQualified (L.Token (HQ'.toHQ ident) start end) docEmbedAnnotation :: - Doc.EmbedAnnotation (HQ'.HashQualified Name) (Doc.Leaf (HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> + Doc.EmbedAnnotation + (ReferenceType, HQ'.HashQualified Name) + (Doc.Leaf (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme] Void) -> TermP v m docEmbedAnnotation d@(Doc.EmbedAnnotation a) = -- This is the only place I’m not sure we’re doing the right thing. In the lexer, this can be an identifier or a -- DocLeaf, but here it could be either /text/ or a Doc element. And I don’t think there’s any way the lexemes -- produced for an identifier and the lexemes consumed for text line up. So, I think this is a bugfix I can’t -- avoid. 
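-- A sketch of the dispatch this commit introduces (illustrative only; `resolveTermIdent` is a hypothetical
-- name, not part of this patch): identifiers now carry a `ReferenceType`, so a term-only position can reject
-- type links with the new `TypeNotAllowed` error instead of silently accepting them:
--
--   resolveTermIdent :: L.Token (ReferenceType, HQ'.HashQualified Name) -> TermP v m
--   resolveTermIdent (L.Token (level, ident) start end) = case level of
--     RtType -> P.customFailure . TypeNotAllowed $ L.Token (HQ'.toHQ ident) start end
--     RtTerm -> resolveHashQualified $ L.Token (HQ'.toHQ ident) start end
--
-- `docEmbedSignatureLink` above and `docEmbedAnnotation` below both follow this shape.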
- Term.app (gann d) (f d "EmbedAnnotation") <$> either (resolveHashQualified . fmap HQ'.toHQ) (docLeaf . vacuous) a + Term.app (gann d) (f d "EmbedAnnotation") + <$> either + ( \(L.Token (level, ident) start end) -> case level of + RtType -> P.customFailure . TypeNotAllowed $ L.Token (HQ'.toHQ ident) start end + RtTerm -> resolveHashQualified $ L.Token (HQ'.toHQ ident) start end + ) + (docLeaf . vacuous) + a docBlock :: (Monad m, Var v) => TermP v m docBlock = do diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index 98112c2124..8a6c20d1a8 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -42,6 +42,7 @@ import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Text.Megaparsec.Error qualified as EP import Text.Megaparsec.Internal qualified as PI +import U.Codebase.Reference (ReferenceType (..)) import Unison.HashQualifiedPrime qualified as HQ' import Unison.Name (Name) import Unison.Name qualified as Name @@ -117,7 +118,7 @@ data Lexeme | Bytes Bytes.Bytes -- bytes literals | Hash ShortHash -- hash literals | Err Err - | Doc (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [Token Lexeme])) + | Doc (Doc.UntitledSection (Doc.Tree (ReferenceType, HQ'.HashQualified Name) [Token Lexeme])) deriving stock (Eq, Show, Ord) type IsVirtual = Bool -- is it a virtual semi or an actual semi? @@ -354,7 +355,7 @@ doc2 = do (docTok, closeTok) <- local (\env -> env {inLayout = False}) do - body <- Doc.doc identifierP lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.doc typeOrTerm lexemes' . P.lookAhead $ () <$ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP @@ -382,12 +383,15 @@ doc2 = do isTopLevel = length (layout env0) + maybe 0 (const 1) (opening env0) == 1 _ -> docTok : endToks where - -- DUPLICATED wordyKw kw = separated wordySep (lit kw) + typeOrAbility' = typeOrAbilityAlt (wordyKw . Text.unpack) + typeOrTerm = do + mtype <- P.optional $ typeOrAbility' <* CP.space + ident <- identifierP <* CP.space + pure (maybe RtTerm (const RtType) mtype, ident) subsequentTypeName = P.lookAhead . P.optional $ do let lit' s = lit s <* sp let modifier = typeModifiersAlt (lit' . Text.unpack) - let typeOrAbility' = typeOrAbilityAlt (wordyKw . 
Text.unpack) _ <- optional modifier *> typeOrAbility' *> sp Token name start stop <- tokenP identifierP if Name.isSymboly (HQ'.toName name) diff --git a/unison-syntax/src/Unison/Syntax/Parser.hs b/unison-syntax/src/Unison/Syntax/Parser.hs index fac55142de..6c4aa74b95 100644 --- a/unison-syntax/src/Unison/Syntax/Parser.hs +++ b/unison-syntax/src/Unison/Syntax/Parser.hs @@ -72,6 +72,7 @@ import Data.Set qualified as Set import Data.Text qualified as Text import Text.Megaparsec (runParserT) import Text.Megaparsec qualified as P +import U.Codebase.Reference (ReferenceType (..)) import U.Util.Base32Hex qualified as Base32Hex import Unison.ABT qualified as ABT import Unison.ConstructorReference (ConstructorReference) @@ -170,6 +171,8 @@ data Error v | TypeDeclarationErrors [UF.Error v Ann] | -- | MissingTypeModifier (type|ability) name MissingTypeModifier (L.Token String) (L.Token v) + | -- | A type was found in a position that requires a term + TypeNotAllowed (L.Token (HQ.HashQualified Name)) | ResolutionFailures [Names.ResolutionFailure v Ann] | DuplicateTypeNames [(v, [Ann])] | DuplicateTermNames [(v, [Ann])] @@ -401,7 +404,8 @@ string = queryToken getString getString (L.Textual s) = Just (Text.pack s) getString _ = Nothing -doc :: (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree (HQ'.HashQualified Name) [L.Token L.Lexeme]))) +doc :: + (Ord v) => P v m (L.Token (Doc.UntitledSection (Doc.Tree (ReferenceType, HQ'.HashQualified Name) [L.Token L.Lexeme]))) doc = queryToken \case L.Doc d -> pure d _ -> Nothing diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index cecf2ca6a2..8ba6840dd2 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -45,8 +45,7 @@ module Unison.Syntax.Parser.Doc -- * other components column', - embedTypeLink, - embedTermLink, + embedLink, embedSignatureLink, join, ) @@ -59,26 +58,13 @@ import Data.List qualified as List import Data.List.Extra qualified as List import Data.List.NonEmpty (NonEmpty ((:|))) import Data.List.NonEmpty qualified as NonEmpty -import Data.Text qualified as Text import Text.Megaparsec qualified as P import Text.Megaparsec.Char (char) import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude hiding (join) -import Unison.Syntax.Lexer - ( column, - line, - lit, - local, - sepBy1', - separated, - some', - someTill', - typeOrAbilityAlt, - wordySep, - (<+>), - ) +import Unison.Syntax.Lexer (column, line, lit, local, sepBy1', some', someTill', (<+>)) import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) import Unison.Syntax.Parser.Doc.Data @@ -109,9 +95,6 @@ doc ident code = flip S.evalStateT initialState . untitledSection . sectionElem untitledSection :: (P.MonadParsec e String m) => m a -> m (UntitledSection a) untitledSection a = UntitledSection <$> P.many (a <* CP.space) -wordyKw :: (P.MonadParsec e String m) => String -> m String -wordyKw kw = separated wordySep (lit kw) - sectionElem :: (Ord e, P.MonadParsec e String m, Annotated code) => m ident -> @@ -228,14 +211,9 @@ evalInline code = fmap EvalInline $ do s <- code inlineEvalClose pure s -embedTypeLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) -embedTypeLink ident = - EmbedTypeLink <$> do - _ <- typeOrAbilityAlt (wordyKw . 
Text.unpack) <* CP.space - tokenP ident <* CP.space - -embedTermLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) -embedTermLink ident = EmbedTermLink <$> tokenP ident <* CP.space +-- | Not an actual node, but this pattern is referenced in multiple places +embedLink :: (Ord e, P.MonadParsec e s m, P.TraversableStream s) => m ident -> m (EmbedLink ident) +embedLink = fmap EmbedLink . tokenP embedSignatureLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedSignatureLink ident) embedSignatureLink ident = EmbedSignatureLink <$> tokenP ident <* CP.space @@ -530,10 +508,6 @@ section ident code docClose = do P.many (sectionElem ident code docClose <* CP.space) pure $ Section (wrap' title) body --- | Not an actual node, but this pattern is referenced in multiple places -embedLink :: (Ord e, P.MonadParsec e String m) => m ident -> m (EmbedLink ident) -embedLink ident = embedTypeLink ident <|> embedTermLink ident - -- | FIXME: This should just take a @`P` code@ and @`P` a@. group :: (P.MonadParsec e s m) => m (NonEmpty (Leaf ident code a)) -> m (Leaf ident code a) group = fmap Group . join diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs index 56a14939b6..75bc3a621e 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc/Data.hs @@ -87,9 +87,9 @@ instance Bifunctor (Leaf ident) where Word x -> Word x Group join -> Group $ bimap f g <$> join -data EmbedLink ident - = EmbedTypeLink (Token ident) - | EmbedTermLink (Token ident) +-- | This is a deviation from the Unison Doc data model – in Unison, Doc distinguishes between type and term links, but +-- here Doc knows nothing about what namespaces may exist. +data EmbedLink ident = EmbedLink (Token ident) deriving (Eq, Ord, Show) data SourceElement ident a = SourceElement (EmbedLink ident) [EmbedAnnotation ident a] @@ -138,9 +138,7 @@ instance (Annotated code, Annotated a) => Annotated (Leaf ident code a) where Group (Join leaves) -> ann leaves instance Annotated (EmbedLink ident) where - ann = \case - EmbedTypeLink name -> ann name - EmbedTermLink name -> ann name + ann (EmbedLink name) = ann name instance (Annotated code) => Annotated (SourceElement ident code) where ann (SourceElement link target) = ann link <> ann target From c5a66d5608849dbe2f4d4b9164cae34ade4c6ca1 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Mon, 29 Jul 2024 16:49:21 -0600 Subject: [PATCH 21/22] Simplify Doc parser from `State` to `Reader` --- .../src/Unison/Syntax/Lexer/Unison.hs | 2 +- unison-syntax/src/Unison/Syntax/Parser/Doc.hs | 44 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs index 8a6c20d1a8..c641786505 100644 --- a/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs +++ b/unison-syntax/src/Unison/Syntax/Lexer/Unison.hs @@ -355,7 +355,7 @@ doc2 = do (docTok, closeTok) <- local (\env -> env {inLayout = False}) do - body <- Doc.doc typeOrTerm lexemes' . P.lookAhead $ () <$ lit "}}" + body <- Doc.doc typeOrTerm lexemes' . 
P.lookAhead $ lit "}}" closeStart <- posP lit "}}" closeEnd <- posP diff --git a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs index 8ba6840dd2..1a03665493 100644 --- a/unison-syntax/src/Unison/Syntax/Parser/Doc.hs +++ b/unison-syntax/src/Unison/Syntax/Parser/Doc.hs @@ -5,12 +5,12 @@ -- -- - an identifier parser -- - a code parser (that accepts a termination parser) --- - a termination parser (only used for lookahead), for this parser to know when to give up +-- - a termination parser, for this parser to know when to give up -- -- Each of those parsers is expected to satisfy @(`Ord` e, `P.MonadParsec` e `String` m)@. module Unison.Syntax.Parser.Doc ( Tree, - initialState, + initialEnv, doc, untitledSection, sectionElem, @@ -52,7 +52,7 @@ module Unison.Syntax.Parser.Doc where import Control.Comonad.Cofree (Cofree ((:<))) -import Control.Monad.State qualified as S +import Control.Monad.Reader qualified as R import Data.Char (isControl, isSpace) import Data.List qualified as List import Data.List.Extra qualified as List @@ -64,7 +64,7 @@ import Text.Megaparsec.Char qualified as CP import Text.Megaparsec.Char.Lexer qualified as LP import Unison.Parser.Ann (Ann, Annotated (..)) import Unison.Prelude hiding (join) -import Unison.Syntax.Lexer (column, line, lit, local, sepBy1', some', someTill', (<+>)) +import Unison.Syntax.Lexer (column, line, lit, sepBy1', some', someTill', (<+>)) import Unison.Syntax.Lexer.Token (Token (Token), posP, tokenP) import Unison.Syntax.Parser.Doc.Data @@ -79,16 +79,16 @@ data ParsingEnv = ParsingEnv } deriving (Show) -initialState :: ParsingEnv -initialState = ParsingEnv [0] 0 +initialEnv :: ParsingEnv +initialEnv = ParsingEnv [0] 0 doc :: (Ord e, P.MonadParsec e String m, Annotated code) => m ident -> (m () -> m code) -> - m () -> + m end -> m (UntitledSection (Tree ident code)) -doc ident code = flip S.evalStateT initialState . untitledSection . sectionElem ident code +doc ident code = flip R.runReaderT initialEnv . untitledSection . sectionElem ident code . void -- | This is the actual `Doc` lexer. Unlike `doc2`, it doesn’t do any Unison-side lexing (i.e., it doesn’t know that -- Unison wraps `Doc` literals in `}}`). 
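-- As a usage sketch (`docBody` is a hypothetical name, mirroring the `doc2` call site in the Unison lexer
-- above; the result type matches the payload of the `Doc` lexeme):
--
--   docBody :: P (UntitledSection (Tree (ReferenceType, HQ'.HashQualified Name) [Token Lexeme]))
--   docBody = Doc.doc typeOrTerm lexemes' (P.lookAhead (lit "}}"))
--
-- Since the terminator is now any `m end` and is voided internally, callers can pass `lit "}}"` directly
-- rather than wrapping it with `() <$`.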
@@ -100,7 +100,7 @@ sectionElem :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Tree ident code) + R.ReaderT ParsingEnv m (Tree ident code) sectionElem ident code docClose = fmap wrap' $ section ident code docClose @@ -390,13 +390,13 @@ list :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) list ident code docClose = bulletedList ident code docClose <|> numberedList ident code docClose -listSep :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m () +listSep :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m () listSep = P.try $ newline *> nonNewlineSpaces *> P.lookAhead (void bulletedStart <|> void numberedStart) -bulletedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, [a]) +bulletedStart :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m (Int, [a]) bulletedStart = P.try $ do r <- listItemStart $ [] <$ P.satisfy bulletChar P.lookAhead (P.satisfy isSpace) @@ -404,15 +404,15 @@ bulletedStart = P.try $ do where bulletChar ch = ch == '*' || ch == '-' || ch == '+' -listItemStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m a -> m (Int, a) +listItemStart :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m a -> m (Int, a) listItemStart gutter = P.try do nonNewlineSpaces col <- column <$> posP - parentCol <- S.gets parentListColumn + parentCol <- R.asks parentListColumn guard (col > parentCol) (col,) <$> gutter -numberedStart :: (Ord e, S.MonadState ParsingEnv m, P.MonadParsec e String m) => m (Int, Token Word64) +numberedStart :: (Ord e, R.MonadReader ParsingEnv m, P.MonadParsec e String m) => m (Int, Token Word64) numberedStart = listItemStart $ P.try (tokenP $ LP.decimal <* lit ".") -- | FIXME: This should take a @`P` a@ @@ -421,7 +421,7 @@ numberedList :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) numberedList ident code docClose = NumberedList <$> sepBy1' numberedItem listSep where numberedItem = P.label "numbered list (examples: 1. item1, 8. start numbering at '8')" do @@ -434,7 +434,7 @@ bulletedList :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) bulletedList ident code docClose = BulletedList <$> sepBy1' bullet listSep where bullet = P.label "bullet (examples: * item1, - item2)" do @@ -447,11 +447,11 @@ column' :: (m () -> m code) -> m () -> Int -> - S.StateT ParsingEnv m (Column (Tree ident code)) + R.ReaderT ParsingEnv m (Column (Tree ident code)) column' ident code docClose col = Column . 
wrap' <$> (nonNewlineSpaces *> listItemParagraph) - <*> local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list ident code docClose)) + <*> R.local (\e -> e {parentListColumn = col}) (P.optional $ listSep *> fmap wrap' (list ident code docClose)) where listItemParagraph = Paragraph <$> do @@ -497,14 +497,14 @@ section :: m ident -> (m () -> m code) -> m () -> - S.StateT ParsingEnv m (Top ident code (Tree ident code)) + R.ReaderT ParsingEnv m (Top ident code (Tree ident code)) section ident code docClose = do - ns <- S.gets parentSections + ns <- R.asks parentSections hashes <- lift $ P.try $ lit (replicate (head ns) '#') *> P.takeWhile1P Nothing (== '#') <* sp docClose title <- lift $ paragraph ident code docClose <* CP.space let m = length hashes + head ns body <- - local (\env -> env {parentSections = (m : (tail ns))}) $ + R.local (\env -> env {parentSections = m : tail ns}) $ P.many (sectionElem ident code docClose <* CP.space) pure $ Section (wrap' title) body From 96f865b37c4fcb31928d976ece21dbf3231d8e87 Mon Sep 17 00:00:00 2001 From: Greg Pfeil Date: Thu, 1 Aug 2024 22:53:33 -0600 Subject: [PATCH 22/22] Add a transcript showing that #5076 was fixed Some handling of blocks without final newlines was improved in the course of this PR. Fixes #5076. --- unison-src/transcripts/fix5076.md | 12 ++++++++++++ unison-src/transcripts/fix5076.output.md | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 unison-src/transcripts/fix5076.md create mode 100644 unison-src/transcripts/fix5076.output.md diff --git a/unison-src/transcripts/fix5076.md b/unison-src/transcripts/fix5076.md new file mode 100644 index 0000000000..d2c4b5a7b2 --- /dev/null +++ b/unison-src/transcripts/fix5076.md @@ -0,0 +1,12 @@ +```ucm:hide +scratch/main> builtins.mergeio +``` + +A nested call to the code lexer wasn’t terminating inline examples containing blocks properly. + +```unison +x = {{ + ``let "me"`` live + ``do "me"`` in + }} +``` diff --git a/unison-src/transcripts/fix5076.output.md b/unison-src/transcripts/fix5076.output.md new file mode 100644 index 0000000000..f92954cd23 --- /dev/null +++ b/unison-src/transcripts/fix5076.output.md @@ -0,0 +1,22 @@ +A nested call to the code lexer wasn’t terminating inline examples containing blocks properly. + +``` unison +x = {{ + ``let "me"`` live + ``do "me"`` in + }} +``` + +``` ucm + + Loading changes detected in scratch.u. + + I found and typechecked these definitions in scratch.u. If you + do an `add` or `update`, here's how your codebase would + change: + + ⍟ These new definitions are ok to `add`: + + x : Doc2 + +```
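A closing note on the `State`-to-`Reader` simplification in patch 21: it works because the parsing environment is only ever narrowed for the duration of a subtree and never threaded between siblings, so `R.local` provides exactly the save-and-restore discipline the `State` version maintained by hand. A minimal standalone sketch of that scoping pattern (`Env` and `sectionDepth` are illustrative stand-ins, not definitions from this PR):

```haskell
import Control.Monad.Reader (Reader, asks, local, runReader)

-- A cut-down stand-in for ParsingEnv: just the section-nesting stack.
newtype Env = Env {parentSections :: [Int]}

-- Entering a section pushes its computed level for the children only;
-- `local` undoes the change automatically once the subtree is parsed.
sectionDepth :: Int -> Reader Env [Int]
sectionDepth hashes = do
  ns <- asks parentSections
  let m = hashes + head ns
  local (\e -> e {parentSections = m : tail ns}) (asks parentSections)

-- runReader (sectionDepth 2) (Env [0]) == [2]
```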