diff --git a/TODO.md b/TODO.md index 49dad11..620230f 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,8 @@ # TODO ## Copy to -- [ ] header -- [ ] sheet +- [x] header +- [x] sheet - [ ] types - [ ] implicit copy to when it sees a gsheets url - [x] warn when more than 2048 rows @@ -19,5 +19,5 @@ - [ ] Service Account keyfile ## Tests -- Tests for read_gsheet() -- Tests for copy to \ No newline at end of file +- [x] Tests for read_gsheet() +- [x] Tests for copy to diff --git a/src/gsheets_copy.cpp b/src/gsheets_copy.cpp index 38bf147..1eda39a 100644 --- a/src/gsheets_copy.cpp +++ b/src/gsheets_copy.cpp @@ -21,26 +21,12 @@ namespace duckdb copy_to_sink = GSheetWriteSink; } - struct GSheetCopyGlobalState : public GlobalFunctionData - { - explicit GSheetCopyGlobalState(ClientContext &context, const string &sheet_id, const string &token, const string &sheet_name) - : sheet_id(sheet_id), token(token), sheet_name(sheet_name) - { - } - - public: - string sheet_id; - string token; - string sheet_name; - }; - - struct GSheetWriteBindData : public TableFunctionData - { - }; unique_ptr GSheetCopyFunction::GSheetWriteBind(ClientContext &context, CopyFunctionBindInput &input, const vector &names, const vector &sql_types) { - return make_uniq(); + string file_path = input.info.file_path; + + return make_uniq(file_path, sql_types, names); } unique_ptr GSheetCopyFunction::GSheetWriteInitializeGlobal(ClientContext &context, FunctionData &bind_data, const string &file_path) @@ -69,14 +55,19 @@ namespace duckdb } std::string token = token_value.ToString(); + std::string spreadsheet_id = extract_spreadsheet_id(file_path); std::string sheet_id = extract_sheet_id(file_path); - std::string sheet_name = "Sheet1"; // TODO: make this configurable + std::string sheet_name = "Sheet1"; + + sheet_name = get_sheet_name_from_id(spreadsheet_id, sheet_id, token); + + std::string encoded_sheet_name = url_encode(sheet_name); // If writing, clear out the entire sheet first. // Do this here in the initialization so that it only happens once - std::string response = delete_sheet_data(sheet_id, token, sheet_name); + std::string response = delete_sheet_data(spreadsheet_id, token, encoded_sheet_name); - return make_uniq(context, sheet_id, token, sheet_name); + return make_uniq(context, spreadsheet_id, token, encoded_sheet_name); } unique_ptr GSheetCopyFunction::GSheetWriteInitializeLocal(ExecutionContext &context, FunctionData &bind_data_p) @@ -89,16 +80,23 @@ namespace duckdb input.Flatten(); auto &gstate = gstate_p.Cast(); + std::string sheet_id = extract_sheet_id(bind_data_p.Cast().files[0]); + + std::string sheet_name = "Sheet1"; + + sheet_name = get_sheet_name_from_id(gstate.spreadsheet_id, sheet_id, gstate.token); + std::string encoded_sheet_name = url_encode(sheet_name); // Create object ready to write to Google Sheet json sheet_data; - // TODO: make this configurable - sheet_data["range"] = "Sheet1"; + sheet_data["range"] = sheet_name; sheet_data["majorDimension"] = "ROWS"; - // TODO: Add column headers + vector headers = bind_data_p.Cast().options.name_list; vector> values; + values.push_back(headers); + for (idx_t r = 0; r < input.size(); r++) { vector row; @@ -135,7 +133,7 @@ namespace duckdb // Make the API call to write data to the Google Sheet // Today, this is only append. - std::string response = fetch_sheet_data(gstate.sheet_id, gstate.token, gstate.sheet_name, HttpMethod::POST, request_body); + std::string response = call_sheets_api(gstate.spreadsheet_id, gstate.token, encoded_sheet_name, HttpMethod::POST, request_body); // Check for errors in the response json response_json = parseJson(response); diff --git a/src/gsheets_read.cpp b/src/gsheets_read.cpp index 22ae90a..1002b9c 100644 --- a/src/gsheets_read.cpp +++ b/src/gsheets_read.cpp @@ -10,9 +10,9 @@ namespace duckdb { using json = nlohmann::json; -ReadSheetBindData::ReadSheetBindData(string sheet_id, string token, bool header, string sheet_name) - : sheet_id(sheet_id), token(token), finished(false), row_index(0), header(header), sheet_name(sheet_name) { - response = fetch_sheet_data(sheet_id, token, sheet_name, HttpMethod::GET); +ReadSheetBindData::ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name) + : spreadsheet_id(spreadsheet_id), token(token), finished(false), row_index(0), header(header), sheet_name(sheet_name) { + response = call_sheets_api(spreadsheet_id, token, sheet_name, HttpMethod::GET); } @@ -81,23 +81,10 @@ unique_ptr ReadSheetBind(ClientContext &context, TableFunctionBind // Default values bool header = true; - string sheet = "Sheet1"; + string sheet_name = "Sheet1"; - // Parse named parameters - for (auto &kv : input.named_parameters) { - if (kv.first == "header") { - try { - header = kv.second.GetValue(); - } catch (const std::exception& e) { - throw InvalidInputException("Invalid value for 'header' parameter. Expected a boolean value."); - } - } else if (kv.first == "sheet") { - sheet = kv.second.GetValue(); - } - } - - // Extract the sheet ID from the input (URL or ID) - std::string sheet_id = extract_sheet_id(sheet_input); + // Extract the spreadsheet ID from the input (URL or ID) + std::string spreadsheet_id = extract_spreadsheet_id(sheet_input); // Use the SecretManager to get the token auto &secret_manager = SecretManager::Get(context); @@ -125,7 +112,26 @@ unique_ptr ReadSheetBind(ClientContext &context, TableFunctionBind std::string token = token_value.ToString(); - auto bind_data = make_uniq(sheet_id, token, header, sheet); + // Get sheet name from URL + std::string sheet_id = extract_sheet_id(sheet_input); + sheet_name = get_sheet_name_from_id(spreadsheet_id, sheet_id, token); + + // Parse named parameters + for (auto &kv : input.named_parameters) { + if (kv.first == "header") { + try { + header = kv.second.GetValue(); + } catch (const std::exception& e) { + throw InvalidInputException("Invalid value for 'header' parameter. Expected a boolean value."); + } + } else if (kv.first == "sheet") { + sheet_name = kv.second.GetValue(); + } + } + + std::string encoded_sheet_name = url_encode(sheet_name); + + auto bind_data = make_uniq(spreadsheet_id, token, header, encoded_sheet_name); json cleanJson = parseJson(bind_data->response); SheetData sheet_data = getSheetData(cleanJson); diff --git a/src/gsheets_requests.cpp b/src/gsheets_requests.cpp index ec21e59..2ac154a 100644 --- a/src/gsheets_requests.cpp +++ b/src/gsheets_requests.cpp @@ -62,6 +62,8 @@ namespace duckdb request += body; } + + if (BIO_write(bio, request.c_str(), request.length()) <= 0) { BIO_free_all(bio); @@ -89,10 +91,10 @@ namespace duckdb return response; } - std::string fetch_sheet_data(const std::string &sheet_id, const std::string &token, const std::string &sheet_name, HttpMethod method, const std::string &body) + std::string call_sheets_api(const std::string &spreadsheet_id, const std::string &token, const std::string &sheet_name, HttpMethod method, const std::string &body) { std::string host = "sheets.googleapis.com"; - std::string path = "/v4/spreadsheets/" + sheet_id + "/values/" + sheet_name; + std::string path = "/v4/spreadsheets/" + spreadsheet_id + "/values/" + sheet_name; if (method == HttpMethod::POST) { path += ":append"; @@ -102,11 +104,18 @@ namespace duckdb return perform_https_request(host, path, token, method, body); } - std::string delete_sheet_data(const std::string &sheet_id, const std::string &token, const std::string &sheet_name) + std::string delete_sheet_data(const std::string &spreadsheet_id, const std::string &token, const std::string &sheet_name) { std::string host = "sheets.googleapis.com"; - std::string path = "/v4/spreadsheets/" + sheet_id + "/values/" + sheet_name + ":clear"; + std::string path = "/v4/spreadsheets/" + spreadsheet_id + "/values/" + sheet_name + ":clear"; return perform_https_request(host, path, token, HttpMethod::POST, "{}"); } + + std::string get_spreadsheet_metadata(const std::string &spreadsheet_id, const std::string &token) + { + std::string host = "sheets.googleapis.com"; + std::string path = "/v4/spreadsheets/" + spreadsheet_id + "?&fields=sheets.properties"; + return perform_https_request(host, path, token, HttpMethod::GET, ""); + } } diff --git a/src/gsheets_utils.cpp b/src/gsheets_utils.cpp index dcbd28b..978acbc 100644 --- a/src/gsheets_utils.cpp +++ b/src/gsheets_utils.cpp @@ -1,24 +1,26 @@ #include "gsheets_utils.hpp" +#include "gsheets_requests.hpp" #include "duckdb/common/exception.hpp" #include #include #include +#include using json = nlohmann::json; namespace duckdb { -std::string extract_sheet_id(const std::string& input) { +std::string extract_spreadsheet_id(const std::string& input) { // Check if the input is already a sheet ID (no slashes) if (input.find('/') == std::string::npos) { return input; } - // Regular expression to match the sheet ID in a Google Sheets URL + // Regular expression to match the spreadsheet ID in a Google Sheets URL if(input.find("docs.google.com/spreadsheets/d/") != std::string::npos) { - std::regex sheet_id_regex("/d/([a-zA-Z0-9-_]+)"); + std::regex spreadsheet_id_regex("/d/([a-zA-Z0-9-_]+)"); std::smatch match; - if (std::regex_search(input, match, sheet_id_regex) && match.size() > 1) { + if (std::regex_search(input, match, spreadsheet_id_regex) && match.size() > 1) { return match.str(1); } } @@ -26,6 +28,39 @@ std::string extract_sheet_id(const std::string& input) { throw duckdb::InvalidInputException("Invalid Google Sheets URL or ID"); } +std::string extract_sheet_id(const std::string& input) { + if (input.find("docs.google.com/spreadsheets/d/") != std::string::npos && input.find("gid=") != std::string::npos) { + std::regex sheet_id_regex("gid=([0-9]+)"); + std::smatch match; + if (std::regex_search(input, match, sheet_id_regex) && match.size() > 1) { + return match.str(1); + } + } + return "0"; +} + +std::string get_sheet_name_from_id(const std::string& spreadsheet_id, const std::string& sheet_id, const std::string& token) { + std::string metadata_response = get_spreadsheet_metadata(spreadsheet_id, token); + json metadata = parseJson(metadata_response); + for (const auto& sheet : metadata["sheets"]) { + if (sheet["properties"]["sheetId"].get() == std::stoi(sheet_id)) { + return sheet["properties"]["title"].get(); + } + } + throw duckdb::InvalidInputException("Sheet with ID %s not found", sheet_id); +} + +std::string get_sheet_id_from_name(const std::string& spreadsheet_id, const std::string& sheet_name, const std::string& token) { + std::string metadata_response = get_spreadsheet_metadata(spreadsheet_id, token); + json metadata = parseJson(metadata_response); + for (const auto& sheet : metadata["sheets"]) { + if (sheet["properties"]["title"].get() == sheet_name) { + return sheet["properties"]["sheetId"].get(); + } + } + throw duckdb::InvalidInputException("Sheet with name %s not found", sheet_name); +} + json parseJson(const std::string& json_str) { try { // Find the start of the JSON object @@ -88,4 +123,18 @@ std::string generate_random_string(size_t length) { return result; } -} // namespace duckdb +std::string url_encode(const std::string& str) { + std::string encoded; + for (char c : str) { + if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') { + encoded += c; + } else { + std::stringstream ss; + ss << std::hex << std::uppercase << static_cast(static_cast(c)); + encoded += '%' + ss.str(); + } + } + return encoded; +} + +} // namespace duckdb \ No newline at end of file diff --git a/src/include/gsheets_copy.hpp b/src/include/gsheets_copy.hpp index d762b08..09121fb 100644 --- a/src/include/gsheets_copy.hpp +++ b/src/include/gsheets_copy.hpp @@ -6,6 +6,37 @@ namespace duckdb { + struct GSheetCopyGlobalState : public GlobalFunctionData + { + explicit GSheetCopyGlobalState(ClientContext &context, const string &spreadsheet_id, const string &token, const string &sheet_name) + : spreadsheet_id(spreadsheet_id), token(token), sheet_name(sheet_name) + { + } + + public: + string spreadsheet_id; + string token; + string sheet_name; + }; + + struct GSheetWriteOptions + { + vector name_list; + }; + + struct GSheetWriteBindData : public TableFunctionData + { + vector files; + GSheetWriteOptions options; + vector sql_types; + + GSheetWriteBindData(string file_path, vector sql_types, vector names) + : sql_types(std::move(sql_types)) + { + files.push_back(std::move(file_path)); + options.name_list = std::move(names); + } + }; class GSheetCopyFunction : public CopyFunction { diff --git a/src/include/gsheets_read.hpp b/src/include/gsheets_read.hpp index 8dd8047..39c47f6 100644 --- a/src/include/gsheets_read.hpp +++ b/src/include/gsheets_read.hpp @@ -8,7 +8,7 @@ namespace duckdb { struct ReadSheetBindData : public TableFunctionData { - string sheet_id; + string spreadsheet_id; string token; bool finished; idx_t row_index; @@ -16,7 +16,7 @@ struct ReadSheetBindData : public TableFunctionData { bool header; string sheet_name; - ReadSheetBindData(string sheet_id, string token, bool header, string sheet_name); + ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name); }; void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output); diff --git a/src/include/gsheets_requests.hpp b/src/include/gsheets_requests.hpp index 4e6ec4a..8a487f5 100644 --- a/src/include/gsheets_requests.hpp +++ b/src/include/gsheets_requests.hpp @@ -13,8 +13,9 @@ enum class HttpMethod { std::string perform_https_request(const std::string& host, const std::string& path, const std::string& token, HttpMethod method = HttpMethod::GET, const std::string& body = "", const std::string& content_type = "application/json"); -std::string fetch_sheet_data(const std::string& sheet_id, const std::string& token, const std::string& sheet_name, HttpMethod method = HttpMethod::GET, const std::string& body = ""); +std::string call_sheets_api(const std::string& spreadsheet_id, const std::string& token, const std::string& sheet_name, HttpMethod method = HttpMethod::GET, const std::string& body = ""); -std::string delete_sheet_data(const std::string& sheet_id, const std::string& token, const std::string& sheet_name); +std::string delete_sheet_data(const std::string& spreadsheet_id, const std::string& token, const std::string& sheet_name); +std::string get_spreadsheet_metadata(const std::string& spreadsheet_id, const std::string& token); } \ No newline at end of file diff --git a/src/include/gsheets_utils.hpp b/src/include/gsheets_utils.hpp index 8541547..7f6ce19 100644 --- a/src/include/gsheets_utils.hpp +++ b/src/include/gsheets_utils.hpp @@ -15,8 +15,34 @@ namespace duckdb { * @return The extracted sheet ID * @throws InvalidInputException if the input is neither a valid URL nor a sheet ID */ +std::string extract_spreadsheet_id(const std::string& input); + + +/** + * Extracts the sheet ID from a Google Sheets URL + * @param input A Google Sheets URL + * @return The extracted sheet ID + */ std::string extract_sheet_id(const std::string& input); +/** + * Gets the sheet name from a spreadsheet ID and sheet ID + * @param spreadsheet_id The spreadsheet ID + * @param sheet_id The sheet ID + * @param token The Google API token + * @return The sheet name + */ +std::string get_sheet_name_from_id(const std::string& spreadsheet_id, const std::string& sheet_id, const std::string& token); + +/** + * Gets the sheet ID from a spreadsheet ID and sheet name + * @param spreadsheet_id The spreadsheet ID + * @param sheet_name The sheet name + * @param token The Google API token + * @return The sheet ID + */ +std::string get_sheet_id_from_name(const std::string& spreadsheet_id, const std::string& sheet_name, const std::string& token); + struct SheetData { std::string range; std::string majorDimension; @@ -25,6 +51,11 @@ struct SheetData { SheetData getSheetData(const json& j); +/** + * Parses a JSON string into a json object + * @param json_str The JSON string + * @return The parsed json object + */ json parseJson(const std::string& json_str); /** @@ -34,4 +65,12 @@ json parseJson(const std::string& json_str); */ std::string generate_random_string(size_t length); -} // namespace duckdb + +/** + * Encodes a string to be used in a URL + * @param str The string to encode + * @return The encoded string + */ +std::string url_encode(const std::string& str); + +} // namespace duckdb \ No newline at end of file diff --git a/test/sql/copy_to.test b/test/sql/copy_to.test new file mode 100644 index 0000000..6cc7108 --- /dev/null +++ b/test/sql/copy_to.test @@ -0,0 +1,35 @@ +# name: test/sql/copy_to.test +# description: test COPY TO function +# group: [gsheets] + +require-env TOKEN + +require gsheets + +# Create a secret NB must substitute a token, do not commit! +statement ok +create secret test_secret (type gsheet, token '${TOKEN}'); + +# Create a table to copy to Google Sheet +statement ok +create table spreadsheets as +select 'Microsoft' as company, 'Excel' as product, 1985 as year_founded +union all +select 'Google', 'Google Sheets', 2006 +union all +select 'Apple', 'Numbers', 1984 +union all +select 'LibreOffice', 'Calc', 2000; + +# Copy the table to Google Sheet +statement ok +copy spreadsheets to 'https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1295634987#gid=1295634987' (format gsheet); + +# Read the table from Google Sheet +query III +from read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1295634987#gid=1295634987'); +---- +Microsoft Excel 1985 +Google Google Sheets 2006 +Apple Numbers 1984 +LibreOffice Calc 2000 diff --git a/test/sql/gsheets.test b/test/sql/read_gsheet.test similarity index 95% rename from test/sql/gsheets.test rename to test/sql/read_gsheet.test index 0aecd3b..9c4fcf2 100644 --- a/test/sql/gsheets.test +++ b/test/sql/read_gsheet.test @@ -1,5 +1,5 @@ -# name: test/sql/gsheets.test -# description: test gsheets extension +# name: test/sql/read_gsheet.test +# description: test read_gsheet() function # group: [gsheets] require-env TOKEN