Skip to content

Commit

Permalink
add feature download all wiki
Browse files Browse the repository at this point in the history
  • Loading branch information
wangzhankun authored and Wsine committed Aug 27, 2024
1 parent c3ad27a commit ab9846e
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 9 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ app
node_modules
.next
.vercel
.vscode/**
27 changes: 19 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,17 @@

$ feishu2md dl -h
NAME:
feishu2md download - Download feishu/larksuite document to markdown file

feishu2md download - Download feishu/larksuite document to markdown file
USAGE:
feishu2md download [command options] <url>

feishu2md download [command options] <url>
OPTIONS:
--output value, -o value Specify the output directory for the markdown files (default: "./")
--dump Dump json response of the OPEN API (default: false)
--batch Download all documents under a folder (default: false)
--help, -h show help (default: false)
--output value, -o value Specify the output directory for the markdown files (default: "./")
--dump Dump json response of the OPEN API (default: false)
--batch Download all documents under a folder (default: false)
--wiki Download all documents within the wiki. (default: false)
--help, -h show help (default: false)

```
Expand Down Expand Up @@ -117,6 +118,16 @@
$ feishu2md dl --batch -o output_directory "https://domain.feishu.cn/drive/folder/foldertoken"
```
**批量下载某知识库的全部文档为 Markdown**
通过 `feishu2md dl --wiki <your feishu wiki setting url>` 直接下载，wiki settings 链接可以通过打开知识库设置获得。
示例:
```bash
$ feishu2md dl --wiki -o output_directory "https://domain.feishu.cn/wiki/settings/123456789101112"
```
</details>
<details>
Expand Down
83 changes: 83 additions & 0 deletions cmd/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ type DownloadOpts struct {
outputDir string
dump bool
batch bool
wiki bool
}

// dlOpts holds the options of the download command. The CLI flag definitions
// write into its fields through their Destination pointers, and the download
// helpers read them back (for example downloadWiki copies dlOpts.dump).
var dlOpts = DownloadOpts{}
Expand All @@ -35,6 +36,9 @@ func downloadDocument(ctx context.Context, client *core.Client, url string, opts
// for a wiki page, we need to renew docType and docToken first
if docType == "wiki" {
node, err := client.GetWikiNodeInfo(ctx, docToken)
if err != nil {
err = fmt.Errorf("GetWikiNodeInfo err: %v for %v", err, url)
}
utils.CheckErr(err)
docType = node.ObjType
docToken = node.ObjToken
Expand Down Expand Up @@ -165,6 +169,81 @@ func downloadDocuments(ctx context.Context, client *core.Client, url string) err
return nil
}

// downloadWiki downloads every "docx" document of a wiki space into a
// directory tree that mirrors the wiki hierarchy.
//
// url must be a wiki settings URL accepted by utils.ValidateWikiURL. The wiki
// name is used as the root output folder; each node that has children becomes
// a sub-folder named after the node title. Documents are downloaded
// concurrently, with at most maxConcurrency downloads in flight.
//
// It returns the error that stopped the tree walk, if any; otherwise the first
// error reported by a document download; otherwise nil. In every case it waits
// for all started downloads to finish before returning.
func downloadWiki(ctx context.Context, client *core.Client, url string) error {
	prefixURL, spaceID, err := utils.ValidateWikiURL(url)
	if err != nil {
		return err
	}

	folderPath, err := client.GetWikiName(ctx, spaceID)
	if err != nil {
		return err
	}
	if folderPath == "" {
		return fmt.Errorf("failed to GetWikiName")
	}

	const maxConcurrency = 10 // upper bound on simultaneous document downloads

	var (
		wg       sync.WaitGroup
		mu       sync.Mutex // guards firstErr
		firstErr error
	)
	// Counting semaphore: one slot per in-flight download.
	semaphore := make(chan struct{}, maxConcurrency)

	var downloadWikiNode func(folderPath string, parentNodeToken *string) error
	downloadWikiNode = func(folderPath string, parentNodeToken *string) error {
		nodes, err := client.GetWikiNodeList(ctx, spaceID, parentNodeToken)
		if err != nil {
			return err
		}
		for _, n := range nodes {
			if n.HasChild {
				childPath := filepath.Join(folderPath, n.Title)
				if err := downloadWikiNode(childPath, &n.NodeToken); err != nil {
					return err
				}
			}
			if n.ObjType != "docx" {
				continue
			}

			opts := DownloadOpts{outputDir: folderPath, dump: dlOpts.dump, batch: false}
			docURL := prefixURL + "/wiki/" + n.NodeToken

			semaphore <- struct{}{}
			wg.Add(1)
			go func() {
				// Release the slot and the wait group unconditionally so a
				// failing download can never stall the walk; the previous
				// implementation blocked on an unbuffered error channel
				// before releasing either of them.
				defer wg.Done()
				defer func() { <-semaphore }()

				if err := downloadDocument(ctx, client, docURL, &opts); err != nil {
					mu.Lock()
					if firstErr == nil {
						firstErr = err
					}
					mu.Unlock()
				}
			}()
		}
		return nil
	}

	walkErr := downloadWikiNode(folderPath, nil)

	// Wait for all the downloads to finish, even when the walk failed, so no
	// goroutine is left running after this function returns.
	wg.Wait()

	if walkErr != nil {
		return walkErr
	}
	return firstErr
}

func handleDownloadCommand(url string) error {
// Load config
configPath, err := core.GetConfigFilePath()
Expand All @@ -187,5 +266,9 @@ func handleDownloadCommand(url string) error {
return downloadDocuments(ctx, client, url)
}

if dlOpts.wiki {
return downloadWiki(ctx, client, url)
}

return downloadDocument(ctx, client, url, &dlOpts)
}
8 changes: 7 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,17 @@ func main() {
Usage: "Download all documents under a folder",
Destination: &dlOpts.batch,
},
&cli.BoolFlag{
Name: "wiki",
Value: false,
Usage: "Download all documents within the wiki.",
Destination: &dlOpts.wiki,
},
},
ArgsUsage: "<url>",
Action: func(ctx *cli.Context) error {
if ctx.NArg() == 0 {
return cli.Exit("Please specify the document/folder url", 1)
return cli.Exit("Please specify the document/folder/wiki url", 1)
} else {
url := ctx.Args().First()
return handleDownloadCommand(url)
Expand Down
44 changes: 44 additions & 0 deletions core/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,47 @@ func (c *Client) GetDriveFolderFileList(ctx context.Context, pageToken *string,
}
return files, nil
}

// GetWikiName looks up the wiki space identified by spaceID through
// GetWikiSpace and returns the Name of the returned space. On a request error
// it returns an empty string together with that error.
func (c *Client) GetWikiName(ctx context.Context, spaceID string) (string, error) {
	req := &lark.GetWikiSpaceReq{SpaceID: spaceID}

	spaceResp, _, err := c.larkClient.Drive.GetWikiSpace(ctx, req)
	if err != nil {
		return "", err
	}
	return spaceResp.Space.Name, nil
}

// GetWikiNodeList returns all direct child nodes of parentNodeToken within the
// wiki space spaceID, following pagination until the API reports no further
// pages. A nil parentNodeToken is forwarded unchanged to the API.
//
// On any request error it returns nil and that error; nodes collected from
// earlier pages are discarded.
func (c *Client) GetWikiNodeList(ctx context.Context, spaceID string, parentNodeToken *string) ([]*lark.GetWikiNodeListRespItem, error) {
	var (
		nodes     []*lark.GetWikiNodeListRespItem
		pageToken *string // nil requests the first page
	)

	for {
		resp, _, err := c.larkClient.Drive.GetWikiNodeList(ctx, &lark.GetWikiNodeListReq{
			SpaceID:         spaceID,
			PageSize:        nil,
			PageToken:       pageToken,
			ParentNodeToken: parentNodeToken,
		})
		if err != nil {
			return nil, err
		}

		nodes = append(nodes, resp.Items...)

		if !resp.HasMore || resp.PageToken == "" {
			return nodes, nil
		}
		// Defensive stop: a repeated token would otherwise re-fetch the same
		// page forever.
		if pageToken != nil && *pageToken == resp.PageToken {
			return nodes, nil
		}

		// Copy the token so the pointer stays valid once resp goes out of
		// scope at the end of this iteration.
		next := resp.PageToken
		pageToken = &next
	}
}
13 changes: 13 additions & 0 deletions core/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,16 @@ func TestGetDriveFolderFileList(t *testing.T) {
t.Errorf("Error: no files found")
}
}

// TestGetWikiNodeList lists the top-level nodes of a fixed wiki space using
// credentials obtained from getIdAndSecretFromEnv, and reports an error when
// the call fails or yields no nodes.
func TestGetWikiNodeList(t *testing.T) {
	const spaceID = "7376995595006787612"

	id, secret := getIdAndSecretFromEnv(t)
	client := core.NewClient(id, secret)

	items, listErr := client.GetWikiNodeList(context.Background(), spaceID, nil)
	if listErr != nil {
		t.Error(listErr)
	}
	if len(items) == 0 {
		t.Errorf("Error: no nodes found")
	}
}
12 changes: 12 additions & 0 deletions utils/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,15 @@ func ValidateFolderURL(url string) (string, error) {
folderToken := matchResult[1]
return folderToken, nil
}

// wikiSettingsURLPattern matches a wiki settings URL of the form
// https://<host>/wiki/settings/<spaceID>. Group 1 captures the scheme and
// host, group 2 the alphanumeric space ID. It is compiled once at package
// initialisation instead of on every call.
var wikiSettingsURLPattern = regexp.MustCompile(`^(https://[\w-.]+)/wiki/settings/([a-zA-Z0-9]+)$`)

// ValidateWikiURL checks that url is a wiki settings URL and splits it into
// its URL prefix (scheme and host, without a trailing slash) and the wiki
// space ID. It returns two empty strings and an error when url does not match
// the expected pattern.
func ValidateWikiURL(url string) (string, string, error) {
	matchResult := wikiSettingsURLPattern.FindStringSubmatch(url)
	if matchResult == nil {
		return "", "", errors.Errorf("Invalid feishu/larksuite wiki settings URL pattern")
	}
	// A successful match always yields the full match plus both groups.
	prefixURL, wikiToken := matchResult[1], matchResult[2]
	return prefixURL, wikiToken, nil
}
46 changes: 46 additions & 0 deletions utils/url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,49 @@ func TestValidateDownloadURL(t *testing.T) {
})
}
}

// TestValidWikiURL is a table-driven test for ValidateWikiURL. Each case
// supplies a URL, the expected prefix and token, and whether the call is
// expected to succeed (noErr).
func TestValidWikiURL(t *testing.T) {
	tests := []struct {
		name   string
		url    string
		prefix string
		token  string
		noErr  bool
	}{
		{
			// An empty URL must be rejected.
			name:   "validate empty url failed",
			url:    "",
			prefix: "",
			token:  "",
			noErr:  false,
		},
		{
			// A wiki page URL is not a wiki settings URL.
			name:   "validate docs url failed",
			url:    "https://sample.sg.larksuite.com/wiki/doccnByZP6puODElAYySJkPIfUb",
			prefix: "",
			token:  "",
			noErr:  false,
		},
		{
			name:   "validate feishu url failed",
			url:    "https://sample.feishu.cn/docx/doccnByZP6puODElAYySJkPIfUb",
			prefix: "",
			token:  "",
			noErr:  false,
		},
		{
			name:   "validate larksuite wiki settings success",
			url:    "https://sample.sg.larksuite.com/wiki/settings/doccnByZP6puODElAYySJkPIfUb",
			prefix: "https://sample.sg.larksuite.com",
			token:  "doccnByZP6puODElAYySJkPIfUb",
			noErr:  true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			prefix, token, err := ValidateWikiURL(tt.url)
			if (err == nil) != tt.noErr || prefix != tt.prefix || token != tt.token {
				t.Errorf("ValidateWikiURL(%v) = %v, %v, err = %v; want prefix = %v, want token = %v, want noErr = %v",
					tt.url, prefix, token, err, tt.prefix, tt.token, tt.noErr)
			}
		})
	}
}

0 comments on commit ab9846e

Please sign in to comment.