From 6198e0de8970374ab12495750ca6742a8a6cd0b3 Mon Sep 17 00:00:00 2001 From: Petr Shumilov Date: Fri, 6 Sep 2024 19:12:03 +0300 Subject: [PATCH] Add str_getcsv builtin support (#1096) Signed-off-by: Petr Shumilov --- builtin-functions/kphp-full/_functions.txt | 2 + runtime/streams.cpp | 94 ++++++++++--------- runtime/streams.h | 3 + runtime/string_functions.cpp | 40 ++++++++ runtime/string_functions.h | 3 + .../phpt/string_functions/011_str_getcsv.php | 27 ++++++ tests/zend-test-list | 3 + 7 files changed, 127 insertions(+), 45 deletions(-) create mode 100644 tests/phpt/string_functions/011_str_getcsv.php diff --git a/builtin-functions/kphp-full/_functions.txt b/builtin-functions/kphp-full/_functions.txt index 7846b8c4b0..48b2d84f79 100644 --- a/builtin-functions/kphp-full/_functions.txt +++ b/builtin-functions/kphp-full/_functions.txt @@ -739,6 +739,8 @@ function rtrim ($s ::: string, $what ::: string = " \n\r\t\v\0") ::: string; function xor_strings ($s ::: string, $t ::: string) ::: string; function similar_text ($first ::: string, $second ::: string, float &$percent = TODO) ::: int; +function str_getcsv($str ::: string, string $delimiter ::: string = ",", string $enclosure ::: string = "\"", string $escape ::: string = "\\") ::: mixed[] | false; + function extension_loaded(string $extension): bool; function ctype_alnum(mixed $text): bool; diff --git a/runtime/streams.cpp b/runtime/streams.cpp index f6a5a2e7e5..3a9c47ad0d 100644 --- a/runtime/streams.cpp +++ b/runtime/streams.cpp @@ -12,8 +12,6 @@ #include "runtime/allocator.h" #include "runtime/critical_section.h" -constexpr int PHP_CSV_NO_ESCAPE = EOF; - static string::size_type max_wrapper_name_size = 0; static array wrappers; @@ -505,43 +503,15 @@ static const char *fgetcsv_lookup_trailing_spaces(const char *ptr, size_t len) { return ptr; } - -Optional> f$fgetcsv(const Stream &stream, int64_t length, string delimiter, string enclosure, string escape) { - if (delimiter.empty()) { - php_warning("delimiter must 
be a character"); - return false; - } else if (delimiter.size() > 1) { - php_warning("delimiter must be a single character"); - } - if (enclosure.empty()) { - php_warning("enclosure must be a character"); - return false; - } else if (enclosure.size() > 1) { - php_warning("enclosure must be a single character"); - } - int escape_char = PHP_CSV_NO_ESCAPE; - if (!escape.empty()) { - escape_char = static_cast(escape[0]); - } else if (escape.size() > 1) { - php_warning("escape_char must be a single character"); - } - char delimiter_char = delimiter[0]; - char enclosure_char = enclosure[0]; - if (length < 0) { - php_warning("Length parameter may not be negative"); - return false; - } else if (length == 0) { - length = -1; - } - Optional buf_optional = length < 0 ? f$fgets(stream) : f$fgets(stream, length + 1); - if (!buf_optional.has_value()) { - return false; - } - string buffer = buf_optional.val(); +// Common csv-parsing functionality for +// * fgetcsv +// * str_getcsv +// The function is similar to `php_fgetcsv` function from https://github.com/php/php-src/blob/master/ext/standard/file.c +Optional> getcsv(const Stream &stream, string buffer, char delimiter, char enclosure, char escape) { array answer; int current_id = 0; string_buffer tmp_buffer; - // this part is imported from https://github.com/php/php-src/blob/master/ext/standard/file.c, function php_fgetcsv + // Following part is imported from `php_fgetcsv` char const *buf = buffer.c_str(); char const *bptr = buf; size_t buf_len = buffer.size(); @@ -557,10 +527,10 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de inc_len = (bptr < limit ? (*bptr == '\0' ? 
1 : mblen(bptr, limit - bptr)) : 0); if (inc_len == 1) { char const *tmp = bptr; - while ((*tmp != delimiter_char) && isspace((int)*(unsigned char *)tmp)) { + while ((*tmp != delimiter) && isspace((int)*(unsigned char *)tmp)) { tmp++; } - if (*tmp == enclosure_char) { + if (*tmp == enclosure) { bptr = tmp; } } @@ -571,7 +541,7 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de } first_field = false; /* 2. Read field, leaving bptr pointing at start of next field */ - if (inc_len != 0 && *bptr == enclosure_char) { + if (inc_len != 0 && *bptr == enclosure) { int state = 0; bptr++; /* move on to first character in field */ @@ -641,7 +611,7 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de state = 0; break; case 2: /* embedded enclosure ? let's check it */ - if (*bptr != enclosure_char) { + if (*bptr != enclosure) { /* real enclosure */ tmp_buffer.append(hunk_begin, static_cast(bptr - hunk_begin - 1)); hunk_begin = bptr; @@ -653,9 +623,9 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de state = 0; break; default: - if (*bptr == enclosure_char) { + if (*bptr == enclosure) { state = 2; - } else if (escape_char != PHP_CSV_NO_ESCAPE && *bptr == escape_char) { + } else if (escape != PHP_CSV_NO_ESCAPE && *bptr == escape) { state = 1; } bptr++; @@ -697,7 +667,7 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de inc_len = 1; /* fallthrough */ case 1: - if (*bptr == delimiter_char) { + if (*bptr == delimiter) { goto quit_loop_3; } break; @@ -725,7 +695,7 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de inc_len = 1; /* fallthrough */ case 1: - if (*bptr == delimiter_char) { + if (*bptr == delimiter) { goto quit_loop_4; } break; @@ -740,7 +710,7 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de char const *comp_end = (char *)fgetcsv_lookup_trailing_spaces(tmp_buffer.c_str(), tmp_buffer.size()); tmp_buffer.set_pos(comp_end - tmp_buffer.c_str()); - if (*bptr 
== delimiter_char) { + if (*bptr == delimiter) { bptr++; } } @@ -753,6 +723,43 @@ Optional> f$fgetcsv(const Stream &stream, int64_t length, string de return answer; } +/* Validates the fgetcsv arguments, reads one line from `stream` with f$fgets and hands it to getcsv(); returns false on invalid arguments or when f$fgets yields no value. */ +Optional> f$fgetcsv(const Stream &stream, int64_t length, string delimiter, string enclosure, string escape) { + if (delimiter.empty()) { + php_warning("delimiter must be a character"); + return false; + } else if (delimiter.size() > 1) { + php_warning("delimiter must be a single character"); + } + if (enclosure.empty()) { + php_warning("enclosure must be a character"); + return false; + } else if (enclosure.size() > 1) { + php_warning("enclosure must be a single character"); + } + /* An empty escape string leaves PHP_CSV_NO_ESCAPE in place (escaping disabled); a longer one is cut to its first byte, now with the warning that the former `else if` could never reach. */ + int escape_char = PHP_CSV_NO_ESCAPE; + if (!escape.empty()) { + escape_char = static_cast(escape[0]); + if (escape.size() > 1) { + php_warning("escape_char must be a single character"); + } + } + char delimiter_char = delimiter[0]; + char enclosure_char = enclosure[0]; + if (length < 0) { + php_warning("Length parameter may not be negative"); + return false; + } else if (length == 0) { + length = -1; + } + Optional buf_optional = length < 0 ? 
f$fgets(stream) : f$fgets(stream, length + 1); + if (!buf_optional.has_value()) { + return false; + } + return getcsv(stream, buf_optional.val(), delimiter_char, enclosure_char, escape_char); +} + Optional f$file_get_contents(const string &stream) { STREAM_FUNCTION_BODY(file_get_contents, false)(url); } diff --git a/runtime/streams.h b/runtime/streams.h index 1f1ccc6b6f..1ae579423a 100644 --- a/runtime/streams.h +++ b/runtime/streams.h @@ -15,6 +15,7 @@ constexpr int64_t STREAM_SET_READ_BUFFER_OPTION = 2; constexpr int64_t FILE_APPEND = 1; +constexpr int PHP_CSV_NO_ESCAPE = EOF; struct stream_functions { string name; @@ -89,6 +90,8 @@ Optional f$vfprintf(const Stream &stream, const string &format, const a Optional f$fputcsv(const Stream &stream, const array &fields, string delimiter = string(",", 1), string enclosure = string("\"", 1), string escape_char = string("\\", 1)); +Optional> getcsv(const Stream &stream, string buffer, char delimiter, char enclosure, char escape); + Optional> f$fgetcsv(const Stream &stream, int64_t length = 0, string delimiter = string(",", 1), string enclosure = string("\"", 1), string escape_char = string("\\", 1)); diff --git a/runtime/string_functions.cpp b/runtime/string_functions.cpp index d61931bec0..18756a4512 100644 --- a/runtime/string_functions.cpp +++ b/runtime/string_functions.cpp @@ -14,6 +14,9 @@ #include "runtime/context/runtime-context.h" #include "runtime/interface.h" +// For "f$str_getcsv" support +#include "runtime/streams.h" + const string COLON(",", 1); const string CP1251("cp1251"); const string DOT(".", 1); @@ -2950,3 +2953,40 @@ string str_concat(str_concat_arg s1, str_concat_arg s2, str_concat_arg s3, str_c auto new_size = s1.size + s2.size + s3.size + s4.size + s5.size; return string(new_size, true).append_unsafe(s1.as_tmp_string()).append_unsafe(s2.as_tmp_string()).append_unsafe(s3.as_tmp_string()).append_unsafe(s4.as_tmp_string()).append_unsafe(s5.as_tmp_string()).finish_append(); } + +// Based on `getcsv` from 
`streams` +Optional> f$str_getcsv(const string &str, const string &delimiter, const string &enclosure, const string &escape) { + char delimiter_char = ','; + char enclosure_char = '"'; + char escape_char = PHP_CSV_NO_ESCAPE; /* NOTE(review): PHP_CSV_NO_ESCAPE is declared as an int constant (EOF) in streams.h, so this initialisation narrows it to char; an explicit 0xFF escape byte may then be indistinguishable from the sentinel where char is signed - confirm */ + /* + * Per the PHP manual, delimiter, enclosure and escape must each be one single-byte character. + * A longer argument only raises a warning here and its first byte is used; an empty delimiter or enclosure keeps the default set above, an empty escape keeps PHP_CSV_NO_ESCAPE. + * Since PHP 8.3.11 it should return false + */ + const auto del_size = delimiter.size(); + if (del_size > 1) { + php_warning("Delimiter must be a single character"); + } + if (del_size != 0) { + delimiter_char = delimiter[0]; + } + + const auto enc_size = enclosure.size(); + if (enc_size > 1) { + php_warning("Enclosure must be a single character"); + } + if (enc_size != 0) { + enclosure_char = enclosure[0]; + } + + const auto esc_size = escape.size(); + if (esc_size > 1) { + php_warning("Escape must be a single character"); + } + if (esc_size != 0) { + escape_char = escape[0]; + } + + return getcsv(mixed() /* null */, str, delimiter_char, enclosure_char, escape_char); +} diff --git a/runtime/string_functions.h b/runtime/string_functions.h index a0f13df801..b7e3cc2995 100644 --- a/runtime/string_functions.h +++ b/runtime/string_functions.h @@ -263,6 +263,9 @@ string f$vsprintf(const string &format, const array &args); string f$wordwrap(const string &str, int64_t width = 75, const string &brk = NEW_LINE, bool cut = false); +Optional> f$str_getcsv(const string &s, const string &delimiter = string(1, ','), + const string &enclosure = string(1, '\"'), const string &escape = string(1, '\\')); + /* * * IMPLEMENTATION diff --git a/tests/phpt/string_functions/011_str_getcsv.php b/tests/phpt/string_functions/011_str_getcsv.php new file mode 100644 index 0000000000..4dfc2ebd93 --- /dev/null +++ b/tests/phpt/string_functions/011_str_getcsv.php @@ -0,0 +1,27 @@ +@ok + 2 +// not 1 <=> 3 +var_dump(str_getcsv($s2, ",", "*")); // 1 +var_dump(str_getcsv($s2, ",", "*", "\\")); // 2 +var_dump(str_getcsv($s2, ",", "*", "")); // 3 + + diff --git 
a/tests/zend-test-list b/tests/zend-test-list index bb6c9b4bf1..8b5c776c2a 100644 --- a/tests/zend-test-list +++ b/tests/zend-test-list @@ -659,6 +659,9 @@ ext/standard/tests/strings/vsprintf_basic8.phpt ext/standard/tests/strings/vsprintf_basic9.phpt ext/standard/tests/strings/wordwrap_basic.phpt ext/standard/tests/strings/wordwrap_variation5.phpt +ext/standard/tests/strings/str_getcsv_001.phpt +ext/standard/tests/strings/str_getcsv_002.phpt +ext/standard/tests/strings/bug55674.phpt ext/standard/tests/url/base64_decode_basic_001.phpt ext/standard/tests/url/base64_decode_basic_002.phpt ext/standard/tests/url/base64_encode_basic_001.phpt