Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use sheet IDs in URL for read_gsheet() and COPY TO #29

Merged
merged 7 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# TODO

## Copy to
- [ ] header
- [ ] sheet
- [x] header
- [x] sheet
- [ ] types
- [ ] implicit copy to when it sees a gsheets url
- [x] warn when more than 2048 rows
Expand All @@ -19,5 +19,5 @@
- [ ] Service Account keyfile

## Tests
- Tests for read_gsheet()
- Tests for copy to
- [x] Tests for read_gsheet()
- [x] Tests for copy to
46 changes: 22 additions & 24 deletions src/gsheets_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,12 @@ namespace duckdb
copy_to_sink = GSheetWriteSink;
}

struct GSheetCopyGlobalState : public GlobalFunctionData
{
explicit GSheetCopyGlobalState(ClientContext &context, const string &sheet_id, const string &token, const string &sheet_name)
: sheet_id(sheet_id), token(token), sheet_name(sheet_name)
{
}

public:
string sheet_id;
string token;
string sheet_name;
};

struct GSheetWriteBindData : public TableFunctionData
{
};

unique_ptr<FunctionData> GSheetCopyFunction::GSheetWriteBind(ClientContext &context, CopyFunctionBindInput &input, const vector<string> &names, const vector<LogicalType> &sql_types)
{
return make_uniq<GSheetWriteBindData>();
string file_path = input.info.file_path;

return make_uniq<GSheetWriteBindData>(file_path, sql_types, names);
}

unique_ptr<GlobalFunctionData> GSheetCopyFunction::GSheetWriteInitializeGlobal(ClientContext &context, FunctionData &bind_data, const string &file_path)
Expand Down Expand Up @@ -69,14 +55,19 @@ namespace duckdb
}

std::string token = token_value.ToString();
std::string spreadsheet_id = extract_spreadsheet_id(file_path);
std::string sheet_id = extract_sheet_id(file_path);
std::string sheet_name = "Sheet1"; // TODO: make this configurable
std::string sheet_name = "Sheet1";

sheet_name = get_sheet_name_from_id(spreadsheet_id, sheet_id, token);

std::string encoded_sheet_name = url_encode(sheet_name);

// If writing, clear out the entire sheet first.
// Do this here in the initialization so that it only happens once
std::string response = delete_sheet_data(sheet_id, token, sheet_name);
std::string response = delete_sheet_data(spreadsheet_id, token, encoded_sheet_name);

return make_uniq<GSheetCopyGlobalState>(context, sheet_id, token, sheet_name);
return make_uniq<GSheetCopyGlobalState>(context, spreadsheet_id, token, encoded_sheet_name);
}

unique_ptr<LocalFunctionData> GSheetCopyFunction::GSheetWriteInitializeLocal(ExecutionContext &context, FunctionData &bind_data_p)
Expand All @@ -89,16 +80,23 @@ namespace duckdb
input.Flatten();
auto &gstate = gstate_p.Cast<GSheetCopyGlobalState>();

std::string sheet_id = extract_sheet_id(bind_data_p.Cast<GSheetWriteBindData>().files[0]);

std::string sheet_name = "Sheet1";

sheet_name = get_sheet_name_from_id(gstate.spreadsheet_id, sheet_id, gstate.token);
std::string encoded_sheet_name = url_encode(sheet_name);
// Create object ready to write to Google Sheet
json sheet_data;

// TODO: make this configurable
sheet_data["range"] = "Sheet1";
sheet_data["range"] = sheet_name;
sheet_data["majorDimension"] = "ROWS";

// TODO: Add column headers
vector<string> headers = bind_data_p.Cast<GSheetWriteBindData>().options.name_list;

vector<vector<string>> values;
values.push_back(headers);

for (idx_t r = 0; r < input.size(); r++)
{
vector<string> row;
Expand Down Expand Up @@ -135,7 +133,7 @@ namespace duckdb

// Make the API call to write data to the Google Sheet
// Today, this is only append.
std::string response = fetch_sheet_data(gstate.sheet_id, gstate.token, gstate.sheet_name, HttpMethod::POST, request_body);
std::string response = call_sheets_api(gstate.spreadsheet_id, gstate.token, encoded_sheet_name, HttpMethod::POST, request_body);

// Check for errors in the response
json response_json = parseJson(response);
Expand Down
46 changes: 26 additions & 20 deletions src/gsheets_read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ namespace duckdb {

using json = nlohmann::json;

ReadSheetBindData::ReadSheetBindData(string sheet_id, string token, bool header, string sheet_name)
: sheet_id(sheet_id), token(token), finished(false), row_index(0), header(header), sheet_name(sheet_name) {
response = fetch_sheet_data(sheet_id, token, sheet_name, HttpMethod::GET);
ReadSheetBindData::ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name)
: spreadsheet_id(spreadsheet_id), token(token), finished(false), row_index(0), header(header), sheet_name(sheet_name) {
response = call_sheets_api(spreadsheet_id, token, sheet_name, HttpMethod::GET);
}


Expand Down Expand Up @@ -81,23 +81,10 @@ unique_ptr<FunctionData> ReadSheetBind(ClientContext &context, TableFunctionBind

// Default values
bool header = true;
string sheet = "Sheet1";
string sheet_name = "Sheet1";

// Parse named parameters
for (auto &kv : input.named_parameters) {
if (kv.first == "header") {
try {
header = kv.second.GetValue<bool>();
} catch (const std::exception& e) {
throw InvalidInputException("Invalid value for 'header' parameter. Expected a boolean value.");
}
} else if (kv.first == "sheet") {
sheet = kv.second.GetValue<string>();
}
}

// Extract the sheet ID from the input (URL or ID)
std::string sheet_id = extract_sheet_id(sheet_input);
// Extract the spreadsheet ID from the input (URL or ID)
std::string spreadsheet_id = extract_spreadsheet_id(sheet_input);

// Use the SecretManager to get the token
auto &secret_manager = SecretManager::Get(context);
Expand Down Expand Up @@ -125,7 +112,26 @@ unique_ptr<FunctionData> ReadSheetBind(ClientContext &context, TableFunctionBind

std::string token = token_value.ToString();

auto bind_data = make_uniq<ReadSheetBindData>(sheet_id, token, header, sheet);
// Get sheet name from URL
std::string sheet_id = extract_sheet_id(sheet_input);
sheet_name = get_sheet_name_from_id(spreadsheet_id, sheet_id, token);

// Parse named parameters
for (auto &kv : input.named_parameters) {
if (kv.first == "header") {
try {
header = kv.second.GetValue<bool>();
} catch (const std::exception& e) {
throw InvalidInputException("Invalid value for 'header' parameter. Expected a boolean value.");
}
} else if (kv.first == "sheet") {
sheet_name = kv.second.GetValue<string>();
}
}

std::string encoded_sheet_name = url_encode(sheet_name);

auto bind_data = make_uniq<ReadSheetBindData>(spreadsheet_id, token, header, encoded_sheet_name);

json cleanJson = parseJson(bind_data->response);
SheetData sheet_data = getSheetData(cleanJson);
Expand Down
17 changes: 13 additions & 4 deletions src/gsheets_requests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ namespace duckdb
request += body;
}



if (BIO_write(bio, request.c_str(), request.length()) <= 0)
{
BIO_free_all(bio);
Expand Down Expand Up @@ -89,10 +91,10 @@ namespace duckdb
return response;
}

std::string fetch_sheet_data(const std::string &sheet_id, const std::string &token, const std::string &sheet_name, HttpMethod method, const std::string &body)
std::string call_sheets_api(const std::string &spreadsheet_id, const std::string &token, const std::string &sheet_name, HttpMethod method, const std::string &body)
{
std::string host = "sheets.googleapis.com";
std::string path = "/v4/spreadsheets/" + sheet_id + "/values/" + sheet_name;
std::string path = "/v4/spreadsheets/" + spreadsheet_id + "/values/" + sheet_name;

if (method == HttpMethod::POST) {
path += ":append";
Expand All @@ -102,11 +104,18 @@ namespace duckdb
return perform_https_request(host, path, token, method, body);
}

std::string delete_sheet_data(const std::string &sheet_id, const std::string &token, const std::string &sheet_name)
std::string delete_sheet_data(const std::string &spreadsheet_id, const std::string &token, const std::string &sheet_name)
{
std::string host = "sheets.googleapis.com";
std::string path = "/v4/spreadsheets/" + sheet_id + "/values/" + sheet_name + ":clear";
std::string path = "/v4/spreadsheets/" + spreadsheet_id + "/values/" + sheet_name + ":clear";

return perform_https_request(host, path, token, HttpMethod::POST, "{}");
}

// Issues a GET against the Sheets v4 spreadsheet resource for `spreadsheet_id`,
// asking only for the `sheets.properties` fields, and returns whatever
// perform_https_request() hands back (no parsing is done here).
//
// spreadsheet_id: inserted verbatim into the request path.
// token:          forwarded unchanged to perform_https_request().
std::string get_spreadsheet_metadata(const std::string &spreadsheet_id, const std::string &token)
{
    const std::string metadata_path = "/v4/spreadsheets/" + spreadsheet_id + "?&fields=sheets.properties";
    return perform_https_request("sheets.googleapis.com", metadata_path, token, HttpMethod::GET, "");
}
}
59 changes: 54 additions & 5 deletions src/gsheets_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,66 @@
#include "gsheets_utils.hpp"
#include "gsheets_requests.hpp"
#include "duckdb/common/exception.hpp"
#include <regex>
#include <json.hpp>
#include <iostream>
#include <sstream>

using json = nlohmann::json;
namespace duckdb {

std::string extract_sheet_id(const std::string& input) {
std::string extract_spreadsheet_id(const std::string& input) {
// Check if the input is already a bare spreadsheet ID (contains no slashes)
if (input.find('/') == std::string::npos) {
return input;
}

// Regular expression to match the sheet ID in a Google Sheets URL
// Regular expression to match the spreadsheet ID in a Google Sheets URL
if(input.find("docs.google.com/spreadsheets/d/") != std::string::npos) {
std::regex sheet_id_regex("/d/([a-zA-Z0-9-_]+)");
std::regex spreadsheet_id_regex("/d/([a-zA-Z0-9-_]+)");
std::smatch match;

if (std::regex_search(input, match, sheet_id_regex) && match.size() > 1) {
if (std::regex_search(input, match, spreadsheet_id_regex) && match.size() > 1) {
return match.str(1);
}
}

throw duckdb::InvalidInputException("Invalid Google Sheets URL or ID");
}

// Pulls the numeric sheet (tab) id out of a Google Sheets URL.
//
// The id is the run of digits following the first "gid=" in `input`.
// Returns "0" whenever `input` is not a docs.google.com spreadsheet URL,
// carries no "gid=" marker, or the marker is not followed by digits.
std::string extract_sheet_id(const std::string& input) {
    const std::string fallback_id = "0";

    const bool is_sheets_url = input.find("docs.google.com/spreadsheets/d/") != std::string::npos;
    if (!is_sheets_url) {
        return fallback_id;
    }
    if (input.find("gid=") == std::string::npos) {
        return fallback_id;
    }

    const std::regex gid_pattern("gid=([0-9]+)");
    std::smatch gid_match;
    const bool found = std::regex_search(input, gid_match, gid_pattern);
    if (!found || gid_match.size() <= 1) {
        return fallback_id;
    }
    return gid_match.str(1);
}

// Resolves a numeric sheet (tab) id to that sheet's title.
//
// Fetches the spreadsheet metadata via get_spreadsheet_metadata(), parses it
// with parseJson(), and returns the "title" of the first entry in "sheets"
// whose "properties.sheetId" equals `sheet_id`.
//
// spreadsheet_id: spreadsheet whose metadata is requested.
// sheet_id:       decimal sheet id as a string (e.g. "0").
// token:          forwarded to get_spreadsheet_metadata().
//
// Throws InvalidInputException if `sheet_id` is not a decimal integer that
// fits in an int, or if no sheet with that id is present in the metadata.
std::string get_sheet_name_from_id(const std::string& spreadsheet_id, const std::string& sheet_id, const std::string& token) {
    // Parse the id once, before any network traffic: the value is loop-invariant,
    // and a malformed id should surface as an input error rather than as a raw
    // std::invalid_argument / std::out_of_range escaping from std::stoi.
    int target_sheet_id = 0;
    try {
        target_sheet_id = std::stoi(sheet_id);
    } catch (const std::exception&) {
        throw duckdb::InvalidInputException("Invalid sheet ID: %s", sheet_id);
    }

    std::string metadata_response = get_spreadsheet_metadata(spreadsheet_id, token);
    json metadata = parseJson(metadata_response);
    for (const auto& sheet : metadata["sheets"]) {
        if (sheet["properties"]["sheetId"].get<int>() == target_sheet_id) {
            return sheet["properties"]["title"].get<std::string>();
        }
    }
    throw duckdb::InvalidInputException("Sheet with ID %s not found", sheet_id);
}

std::string get_sheet_id_from_name(const std::string& spreadsheet_id, const std::string& sheet_name, const std::string& token) {
std::string metadata_response = get_spreadsheet_metadata(spreadsheet_id, token);
json metadata = parseJson(metadata_response);
for (const auto& sheet : metadata["sheets"]) {
if (sheet["properties"]["title"].get<std::string>() == sheet_name) {
return sheet["properties"]["sheetId"].get<std::string>();
}
}
throw duckdb::InvalidInputException("Sheet with name %s not found", sheet_name);
}

json parseJson(const std::string& json_str) {
try {
// Find the start of the JSON object
Expand Down Expand Up @@ -88,4 +123,18 @@ std::string generate_random_string(size_t length) {
return result;
}

} // namespace duckdb
// Percent-encodes `str` for safe inclusion in a URL.
//
// ASCII letters, digits and the characters '-', '_', '.', '~' are copied
// through unchanged. Every other byte is written as '%' followed by exactly
// two uppercase hexadecimal digits, so bytes below 0x10 come out as e.g.
// "%0A" rather than "%A". The input is processed byte by byte, so multi-byte
// UTF-8 sequences become one %XX triplet per byte.
//
// Returns the encoded string; an empty input yields an empty string.
std::string url_encode(const std::string& str) {
    static const char kHexDigits[] = "0123456789ABCDEF";

    std::string encoded;
    encoded.reserve(str.size());
    for (const char c : str) {
        // Work on the unsigned byte value: passing a negative char to isalnum()
        // is undefined behaviour, and the classification must not depend on locale.
        const unsigned char byte = static_cast<unsigned char>(c);
        const bool is_unreserved =
            (byte >= 'A' && byte <= 'Z') ||
            (byte >= 'a' && byte <= 'z') ||
            (byte >= '0' && byte <= '9') ||
            byte == '-' || byte == '_' || byte == '.' || byte == '~';

        if (is_unreserved) {
            encoded += c;
        } else {
            encoded += '%';
            encoded += kHexDigits[byte >> 4];
            encoded += kHexDigits[byte & 0x0F];
        }
    }
    return encoded;
}

} // namespace duckdb
31 changes: 31 additions & 0 deletions src/include/gsheets_copy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,37 @@

namespace duckdb
{
// Global state for a COPY TO a Google Sheet: the spreadsheet id, the token and
// the sheet name that the write path reads back when it issues requests.
struct GSheetCopyGlobalState : public GlobalFunctionData
{
    string spreadsheet_id;
    string token;
    string sheet_name;

    // Copies the three strings into the state. `context` is accepted but not
    // used by this constructor.
    explicit GSheetCopyGlobalState(ClientContext &context, const string &spreadsheet_id_p, const string &token_p, const string &sheet_name_p)
        : spreadsheet_id(spreadsheet_id_p), token(token_p), sheet_name(sheet_name_p)
    {
    }
};

// Options carried alongside the bind data for writing to a Google Sheet.
struct GSheetWriteOptions
{
// Column names; GSheetWriteBindData's constructor fills this from its `names` argument.
vector<string> name_list;
};

struct GSheetWriteBindData : public TableFunctionData
{
vector<string> files;
GSheetWriteOptions options;
vector<LogicalType> sql_types;

GSheetWriteBindData(string file_path, vector<LogicalType> sql_types, vector<string> names)
: sql_types(std::move(sql_types))
{
files.push_back(std::move(file_path));
options.name_list = std::move(names);
}
};

class GSheetCopyFunction : public CopyFunction
{
Expand Down
4 changes: 2 additions & 2 deletions src/include/gsheets_read.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
namespace duckdb {

struct ReadSheetBindData : public TableFunctionData {
string sheet_id;
string spreadsheet_id;
string token;
bool finished;
idx_t row_index;
string response;
bool header;
string sheet_name;

ReadSheetBindData(string sheet_id, string token, bool header, string sheet_name);
ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name);
};

void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output);
Expand Down
5 changes: 3 additions & 2 deletions src/include/gsheets_requests.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ enum class HttpMethod {
std::string perform_https_request(const std::string& host, const std::string& path, const std::string& token,
HttpMethod method = HttpMethod::GET, const std::string& body = "", const std::string& content_type = "application/json");

std::string fetch_sheet_data(const std::string& sheet_id, const std::string& token, const std::string& sheet_name, HttpMethod method = HttpMethod::GET, const std::string& body = "");
std::string call_sheets_api(const std::string& spreadsheet_id, const std::string& token, const std::string& sheet_name, HttpMethod method = HttpMethod::GET, const std::string& body = "");

std::string delete_sheet_data(const std::string& sheet_id, const std::string& token, const std::string& sheet_name);
std::string delete_sheet_data(const std::string& spreadsheet_id, const std::string& token, const std::string& sheet_name);

std::string get_spreadsheet_metadata(const std::string& spreadsheet_id, const std::string& token);
}
Loading
Loading