From 64fd7c5d0b1c5a6ffe5916d5970317827e0b71e1 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Sun, 2 Sep 2018 16:53:51 +0100 Subject: [PATCH 1/6] Prepare for 1.0 --- .travis.yml | 6 ++---- REQUIRE | 4 ++-- appveyor.yml | 45 ++++++++++++++++++++++++--------------------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/.travis.yml b/.travis.yml index 396e0f0b..65941265 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,11 +2,9 @@ language: julia os: - linux julia: - - 0.6 + - 0.7 + - 1.0 notifications: email: false -script: - - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi - - julia -e 'Pkg.clone(pwd()); Pkg.build("TextAnalysis"); Pkg.test("TextAnalysis"; coverage=true)'; after_success: - julia -e 'cd(Pkg.dir("TextAnalysis")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'; diff --git a/REQUIRE b/REQUIRE index 48900650..9e4dd5e7 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,6 +1,6 @@ -julia 0.6 +julia 0.7 BinaryProvider -Languages 0.2.0 +Languages 0.4.0 DataFrames WordTokenizers Flux diff --git a/appveyor.yml b/appveyor.yml index a491b578..bdfd11ca 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,9 +1,18 @@ environment: matrix: - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe" - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe" -# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe" -# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe" + - julia_version: 0.7 + - julia_version: 1 + - julia_version: nightly + +platform: + - x86 # 32-bit + - x64 # 64-bit + +# # Uncomment the following lines to allow failures on nightly julia +# # (tests will run but not make your overall status red) +# matrix: +allow_failures: + - julia_version: nightly branches: only: @@ -17,24 +26,18 @@ notifications: on_build_status_changed: false install: - - ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12" -# If there's a newer build queued for the same PR, cancel this one - - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` - https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` - Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` - throw "There are newer queued builds for this pull request, failing early." } -# Download most recent Julia Windows binary - - ps: (new-object net.webclient).DownloadFile( - $env:JULIA_URL, - "C:\projects\julia-binary.exe") -# Run installer silently, output to C:\projects\julia - - C:\projects\julia-binary.exe /S /D=C:\projects\julia + - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1")) build_script: -# Need to convert from shallow to complete for Pkg.clone to work - - IF EXIST .git\shallow (git fetch --unshallow) - - C:\projects\julia\bin\julia -e "versioninfo(); - Pkg.clone(pwd(), \"TextAnalysis\"); Pkg.build(\"TextAnalysis\")" + - echo "%JL_BUILD_SCRIPT%" + - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%" test_script: - - C:\projects\julia\bin\julia -e "Pkg.test(\"TextAnalysis\")" \ No newline at end of file + - echo "%JL_TEST_SCRIPT%" + - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%" + +# # Uncomment to support code coverage upload. Should only be enabled for packages +# # which would have coverage gaps without running on Windows +# on_success: +# - echo "%JL_CODECOV_SCRIPT%" +# - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%" From b777d88fc53c57208935ac1b3b9d1e61e37e1f8c Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Wed, 5 Sep 2018 15:54:57 +0200 Subject: [PATCH 2/6] Change endof syntax (#98) * Change endof syntax * run femtocleaner --- docs/push-gh-pages.jl | 10 +++++----- src/corpus.jl | 12 ++++++------ src/deprecations.jl | 8 ++++---- src/document.jl | 24 ++++++++++++------------ src/dtm.jl | 18 +++++++++--------- src/hash.jl | 2 +- src/metadata.jl | 6 +++--- src/ngramizer.jl | 4 ++-- src/preprocessing.jl | 18 +++++++++--------- src/stemmer.jl | 16 ++++++++-------- src/tf_idf.jl | 26 +++++++++++++------------- src/tokenizer.jl | 4 ++-- test/runtests.jl | 2 +- 13 files changed, 75 insertions(+), 75 deletions(-) diff --git a/docs/push-gh-pages.jl b/docs/push-gh-pages.jl index 8469f6e9..680ed4c9 100644 --- a/docs/push-gh-pages.jl +++ b/docs/push-gh-pages.jl @@ -4,9 +4,9 @@ last_commit=readchomp(`git --no-pager log -1 --pretty=format:"%h:%s"`) -ENV["GIT_DIR"]=abspath(chomp(readstring(`git rev-parse --git-dir`))) +ENV["GIT_DIR"]=abspath(chomp(read(`git rev-parse --git-dir`, String))) -old_sha = chomp(readstring(`git rev-parse refs/remotes/origin/gh-pages`)) +old_sha = chomp(read(`git rev-parse refs/remotes/origin/gh-pages`, String)) #run(`julia make.jl`) @@ -16,13 +16,13 @@ cd("build") do ENV["GIT_INDEX_FILE"]=gif ENV["GIT_WORK_TREE"]=pwd() run(`git add -A`) - tsha=chomp(readstring(`git write-tree`)) + tsha=chomp(read(`git write-tree`, String)) mesg="Deploy docs for master@$last_commit" if length(old_sha) == 40 - csha = chomp(readstring(`git commit-tree $tsha -p $old_sha -m $(mesg)`)) + csha = chomp(read(`git commit-tree $tsha -p $old_sha -m $(mesg)`, String)) else - csha = chomp(readstring(`git commit-tree $tsha -m $(mesg)`)) + csha = chomp(read(`git commit-tree $tsha -m $(mesg)`, String)) end print("Created commit $csha") diff --git a/src/corpus.jl b/src/corpus.jl index 782e6302..e5083b92 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -45,7 +45,7 @@ function DirectoryCorpus(dirname::AbstractString) cd(dirname) for filename in readdir(".") - if isfile(filename) && !ismatch(r"^\.", filename) + if isfile(filename) && !occursin(r"^\.", filename) push!(docs, FileDocument(abspath(filename))) end if isdir(filename) && !islink(filename) @@ -115,8 +115,8 @@ Base.done(crps::Corpus, ind::Int) = ind > length(crps.documents) Base.push!(crps::Corpus, d::AbstractDocument) = push!(crps.documents, d) Base.pop!(crps::Corpus) = pop!(crps.documents) -Base.unshift!(crps::Corpus, d::AbstractDocument) = unshift!(crps.documents, d) -Base.shift!(crps::Corpus) = shift!(crps.documents) +Base.unshift!(crps::Corpus, d::AbstractDocument) = pushfirst!(crps.documents, d) +Base.shift!(crps::Corpus) = popfirst!(crps.documents) function Base.insert!(crps::Corpus, index::Int, d::AbstractDocument) insert!(crps.documents, index, d) @@ -133,8 +133,8 @@ Base.delete!(crps::Corpus, index::Integer) = delete!(crps.documents, index) ############################################################################## Base.getindex(crps::Corpus, ind::Real) = crps.documents[ind] -Base.getindex{T <: Real}(crps::Corpus, inds::Vector{T}) = crps.documents[inds] -Base.getindex(crps::Corpus, r::Range) = crps.documents[r] +Base.getindex(crps::Corpus, inds::Vector{T}) where {T <: Real} = crps.documents[inds] +Base.getindex(crps::Corpus, r::AbstractRange) = crps.documents[r] Base.getindex(crps::Corpus, term::AbstractString) = get(crps.inverse_index, term, Int[]) ############################################################################## @@ -226,7 +226,7 @@ hash_function!(crps::Corpus, f::TextHashFunction) = (crps.h = f; nothing) # ############################################################################## -function standardize!{T <: AbstractDocument}(crps::Corpus, ::Type{T}) +function standardize!(crps::Corpus, ::Type{T}) where T <: AbstractDocument for i in 1:length(crps) crps.documents[i] = convert(T, crps.documents[i]) end diff --git a/src/deprecations.jl b/src/deprecations.jl index cc747c6a..7aff72f6 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,22 +1,22 @@ ## Deprecations for Languages -function tokenize{S <: Language, T <: AbstractString}(::Type{S}, s::T) +function tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString} depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) tokenize(S(), s) end -function ngramize{S <: Language, T <: AbstractString}(::Type{S}, words::Vector{T}, n::Int) +function ngramize(::Type{S}, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString} depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) ngramize(S(), words, n) end -function onegramize{S <: Language, T <: AbstractString}(::Type{S}, words::Vector{T}) +function onegramize(::Type{S}, words::Vector{T}) where {S <: Language, T <: AbstractString} depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) onegramize(S(), words) end -function stem_all{S <: Language}(stemmer::Stemmer, lang::Type{S}, sentence::AbstractString) +function stem_all(stemmer::Stemmer, lang::Type{S}, sentence::AbstractString) where S <: Language depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) stem_all(stemmer, S(), sentence) end diff --git a/src/document.jl b/src/document.jl index 15f9522a..aeaedaf5 100644 --- a/src/document.jl +++ b/src/document.jl @@ -4,7 +4,7 @@ # ############################################################################## -type DocumentMetadata +mutable struct DocumentMetadata language name::String author::String @@ -31,7 +31,7 @@ abstract type AbstractDocument; end # ############################################################################## -type FileDocument <: AbstractDocument +mutable struct FileDocument <: AbstractDocument filename::String metadata::DocumentMetadata end @@ -48,7 +48,7 @@ end # ############################################################################## -type StringDocument{T<:AbstractString} <: AbstractDocument +mutable struct StringDocument{T<:AbstractString} <: AbstractDocument text::T metadata::DocumentMetadata end @@ -61,14 +61,14 @@ StringDocument(txt::AbstractString) = StringDocument(txt, DocumentMetadata()) # ############################################################################## -type TokenDocument{T<:AbstractString} <: AbstractDocument +mutable struct TokenDocument{T<:AbstractString} <: AbstractDocument tokens::Vector{T} metadata::DocumentMetadata end function TokenDocument(txt::AbstractString, dm::DocumentMetadata) TokenDocument(tokenize(dm.language, String(txt)), dm) end -function TokenDocument{T <: AbstractString}(tkns::Vector{T}) +function TokenDocument(tkns::Vector{T}) where T <: AbstractString TokenDocument(tkns, DocumentMetadata()) end TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata()) @@ -79,7 +79,7 @@ TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata # ############################################################################## -type NGramDocument{T<:AbstractString} <: AbstractDocument +mutable struct NGramDocument{T<:AbstractString} <: AbstractDocument ngrams::Dict{T,Int} n::Int metadata::DocumentMetadata @@ -91,7 +91,7 @@ end function NGramDocument(txt::AbstractString, n::Integer=1) NGramDocument(txt, DocumentMetadata(), n) end -function NGramDocument{T <: AbstractString}(ng::Dict{T, Int}, n::Integer=1) +function NGramDocument(ng::Dict{T, Int}, n::Integer=1) where T <: AbstractString NGramDocument(merge(Dict{AbstractString,Int}(), ng), n, DocumentMetadata()) end @@ -103,12 +103,12 @@ end function text(fd::FileDocument) !isfile(fd.filename) && error("Can't find file: $(fd.filename)") - readstring(fd.filename) + read(fd.filename, String) end text(sd::StringDocument) = sd.text function text(td::TokenDocument) - warn("TokenDocument's can only approximate the original text") + @warn("TokenDocument's can only approximate the original text") join(td.tokens, " ") end function text(ngd::NGramDocument) @@ -132,8 +132,8 @@ function tokens(d::NGramDocument) error("The tokens of an NGramDocument cannot be reconstructed") end -tokens!{T <: AbstractString}(d::TokenDocument, new_tokens::Vector{T}) = (d.tokens = new_tokens) -function tokens!{T <: AbstractString}(d::AbstractDocument, new_tokens::Vector{T}) +tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens) +function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString error("The tokens of a $(typeof(d)) cannot be directly edited") end @@ -199,7 +199,7 @@ const GenericDocument = Union{ ############################################################################## Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str) -Document{T <: AbstractString}(tkns::Vector{T}) = TokenDocument(tkns) +Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns) Document(ng::Dict{String, Int}) = NGramDocument(ng) ############################################################################## diff --git a/src/dtm.jl b/src/dtm.jl index f5a68fdf..2a9a5adc 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -4,7 +4,7 @@ # ############################################################################## -type DocumentTermMatrix +mutable struct DocumentTermMatrix dtm::SparseMatrixCSC{Int, Int} terms::Vector{String} column_indices::Dict{String, Int} @@ -32,9 +32,9 @@ function DocumentTermMatrix(crps::Corpus, terms::Vector{String}) m = length(crps) n = length(terms) - rows = Array{Int}(0) - columns = Array{Int}(0) - values = Array{Int}(0) + rows = Array{Int}(undef, 0) + columns = Array{Int}(undef, 0) + values = Array{Int}(undef, 0) for i in 1:m doc = crps.documents[i] ngs = ngrams(doc) @@ -57,7 +57,7 @@ function DocumentTermMatrix(crps::Corpus, terms::Vector{String}) end DocumentTermMatrix(crps::Corpus) = DocumentTermMatrix(crps, lexicon(crps)) -DocumentTermMatrix(crps::Corpus, lex::Associative) = DocumentTermMatrix(crps, sort(collect(keys(lex)))) +DocumentTermMatrix(crps::Corpus, lex::AbstractDict) = DocumentTermMatrix(crps, sort(collect(keys(lex)))) DocumentTermMatrix(dtm::SparseMatrixCSC{Int, Int},terms::Vector{String}) = DocumentTermMatrix(dtm, terms, columnindices(terms)) @@ -99,8 +99,8 @@ tdm(crps::Corpus) = dtm(crps)' #' function dtm_entries(d::AbstractDocument, lex::Dict{String, Int}) ngs = ngrams(d) - indices = Array{Int}(0) - values = Array{Int}(0) + indices = Array{Int}(undef, 0) + values = Array{Int}(undef, 0) terms = sort(collect(keys(lex))) column_indices = columnindices(terms) @@ -166,7 +166,7 @@ hash_tdm(crps::Corpus) = hash_dtm(crps)' #' # ############################################################################## -type EachDTV +mutable struct EachDTV crps::Corpus end @@ -178,7 +178,7 @@ end done(edt::EachDTV, state::Int) = state > length(edt.crps.documents) -type EachHashDTV +mutable struct EachHashDTV crps::Corpus end diff --git a/src/hash.jl b/src/hash.jl index 8429c5fa..1544ef63 100644 --- a/src/hash.jl +++ b/src/hash.jl @@ -18,7 +18,7 @@ # ############################################################################## -type TextHashFunction +mutable struct TextHashFunction hash_function::Function cardinality::Int end diff --git a/src/metadata.jl b/src/metadata.jl index 7960eac8..6e819481 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -15,7 +15,7 @@ function name!(d::AbstractDocument, nv::AbstractString) d.metadata.name = nv end -function language!{T <: Language}(d::AbstractDocument, nv::T) +function language!(d::AbstractDocument, nv::T) where T <: Language d.metadata.language = nv end @@ -39,7 +39,7 @@ authors(c::Corpus) = map(d -> author(d), documents(c)) timestamps(c::Corpus) = map(d -> timestamp(d), documents(c)) names!(c::Corpus, nv::AbstractString) = name!.(documents(c), nv) -languages!{T <: Language}(c::Corpus, nv::T) = language!.(documents(c), nv) +languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), nv) authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), nv) timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), nv) @@ -50,7 +50,7 @@ function names!(c::Corpus, nvs::Vector{String}) end end -function languages!{T <: Language}(c::Corpus, nvs::Vector{T}) +function languages!(c::Corpus, nvs::Vector{T}) where T <: Language length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match")) for (i, d) in enumerate(IndexLinear(), documents(c)) language!(d, nvs[i]) diff --git a/src/ngramizer.jl b/src/ngramizer.jl index 41390b71..057e5124 100644 --- a/src/ngramizer.jl +++ b/src/ngramizer.jl @@ -4,7 +4,7 @@ # ############################################################################## -function ngramize{S <: Language, T <: AbstractString}(lang::S, words::Vector{T}, n::Int) +function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString} (n == 1) && return onegramize(lang, words) n_words = length(words) @@ -21,7 +21,7 @@ function ngramize{S <: Language, T <: AbstractString}(lang::S, words::Vector{T}, return tokens end -function onegramize{S <: Language, T <: AbstractString}(lang::S, words::Vector{T}) +function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: AbstractString} n_words = length(words) tokens = Dict{T, Int}() diff --git a/src/preprocessing.jl b/src/preprocessing.jl index e8d7172d..33d2e411 100644 --- a/src/preprocessing.jl +++ b/src/preprocessing.jl @@ -43,7 +43,7 @@ end # ############################################################################## function remove_corrupt_utf8(s::AbstractString) - return map(x->UInt(x)!=0xfffd?x:' ' , s) + return map(x->UInt(x)!=0xfffd ? x : ' ' , s) end remove_corrupt_utf8!(d::FileDocument) = error("FileDocument cannot be modified") @@ -84,7 +84,7 @@ end # ############################################################################## -remove_case{T <: AbstractString}(s::T) = lowercase(s) +remove_case(s::T) where {T <: AbstractString} = lowercase(s) remove_case!(d::FileDocument) = error("FileDocument cannot be modified") @@ -151,8 +151,8 @@ end # Remove specified words # ############################################################################## -function remove_words!{T <: AbstractString}(entity::(Union{AbstractDocument,Corpus}), - words::Vector{T}) +function remove_words!(entity::(Union{AbstractDocument,Corpus}), + words::Vector{T}) where T <: AbstractString skipwords = Set{AbstractString}() union!(skipwords, words) prepare!(entity, strip_patterns, skip_words = skipwords) @@ -265,7 +265,7 @@ function remove_patterns(s::AbstractString, rex::Regex) String(take!(iob)) end -function remove_patterns{T <: String}(s::SubString{T}, rex::Regex) +function remove_patterns(s::SubString{T}, rex::Regex) where T <: String iob = IOBuffer() ioffset = s.offset data = Vector{UInt8}(s.string) @@ -320,9 +320,9 @@ end # internal helper methods _build_regex(lang, flags::UInt32) = _build_regex(lang, flags, Set{AbstractString}(), Set{AbstractString}()) -_build_regex{T <: AbstractString}(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) = _combine_regex(_build_regex_patterns(lang, flags, patterns, words)) +_build_regex(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where {T <: AbstractString} = _combine_regex(_build_regex_patterns(lang, flags, patterns, words)) -function _combine_regex{T <: AbstractString}(regex_parts::Set{T}) +function _combine_regex(regex_parts::Set{T}) where T <: AbstractString l = length(regex_parts) (0 == l) && return r"" (1 == l) && return mk_regex(pop!(regex_parts)) @@ -335,7 +335,7 @@ function _combine_regex{T <: AbstractString}(regex_parts::Set{T}) mk_regex(String(take!(iob))) end -function _build_regex_patterns{T <: AbstractString}(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) +function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where T <: AbstractString ((flags & strip_whitespace) > 0) && push!(patterns, "\\s+") if (flags & strip_non_letters) > 0 push!(patterns, "[^a-zA-Z\\s]") @@ -358,7 +358,7 @@ function _build_regex_patterns{T <: AbstractString}(lang, flags::UInt32, pattern patterns end -function _build_words_pattern{T <: AbstractString}(words::Vector{T}) +function _build_words_pattern(words::Vector{T}) where T <: AbstractString isempty(words) && return "" iob = IOBuffer() diff --git a/src/stemmer.jl b/src/stemmer.jl index 313a7ae1..24bba56e 100644 --- a/src/stemmer.jl +++ b/src/stemmer.jl @@ -25,14 +25,14 @@ function stemmer_types() stypes end -type Stemmer - cptr::Ptr{Void} +mutable struct Stemmer + cptr::Ptr{Cvoid} alg::String enc::String function Stemmer(stemmer_type, charenc=UTF_8) cptr = ccall((:sb_stemmer_new, libstemmer), - Ptr{Void}, + Ptr{Cvoid}, (Ptr{UInt8}, Ptr{UInt8}), String(stemmer_type), String(charenc)) @@ -54,7 +54,7 @@ show(io::IO, stm::Stemmer) = println(io, "Stemmer algorithm:$(stm.alg) encoding: function release(stm::Stemmer) (C_NULL == stm.cptr) && return - ccall((:sb_stemmer_delete, libstemmer), Void, (Ptr{Void},), stm.cptr) + ccall((:sb_stemmer_delete, libstemmer), Cvoid, (Ptr{Cvoid},), stm.cptr) stm.cptr = C_NULL nothing end @@ -65,20 +65,20 @@ function stem(stemmer::Stemmer, bstr::AbstractString) (Ptr{UInt8}, Ptr{UInt8}, Cint), stemmer.cptr, bstr, sizeof(bstr)) (C_NULL == sres) && error("error in stemming") - slen = ccall((:sb_stemmer_length, libstemmer), Cint, (Ptr{Void},), stemmer.cptr) - bytes = unsafe_wrap(Array, sres, Int(slen), false) + slen = ccall((:sb_stemmer_length, libstemmer), Cint, (Ptr{Cvoid},), stemmer.cptr) + bytes = unsafe_wrap(Array, sres, Int(slen), own=false) String(copy(bytes)) end -function stem_all{S <: Language}(stemmer::Stemmer, lang::S, sentence::AbstractString) +function stem_all(stemmer::Stemmer, lang::S, sentence::AbstractString) where S <: Language tokens = TextAnalysis.tokenize(lang, sentence) stemmed = stem(stemmer, tokens) join(stemmed, ' ') end function stem(stemmer::Stemmer, words::Array) - const l::Int = length(words) + l::Int = length(words) ret = Array{String}(l) for idx in 1:l ret[idx] = stem(stemmer, words[idx]) diff --git a/src/tf_idf.jl b/src/tf_idf.jl index f8a839e0..d1db3216 100644 --- a/src/tf_idf.jl +++ b/src/tf_idf.jl @@ -4,19 +4,19 @@ # ############################################################################## -tf{T <: Real}(dtm::Matrix{T}) = tf!(dtm, Array{Float64}(size(dtm)...)) +tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(size(dtm)...)) -tf{T <: Real}(dtm::SparseMatrixCSC{T}) = tf!(dtm, similar(dtm, Float64)) +tf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, similar(dtm, Float64)) -tf!{T <: Real}(dtm::AbstractMatrix{T}) = tf!(dtm, dtm) +tf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf!(dtm, dtm) -tf!{T <: Real}(dtm::SparseMatrixCSC{T}) = tf!(dtm, dtm) +tf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, dtm) tf(dtm::DocumentTermMatrix) = tf(dtm.dtm) # The second Matrix will be overwritten with the result # Will work correctly if dtm and tfidf are the same matrix -function tf!{T1 <: Real, T2 <: AbstractFloat}(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) +function tf!(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat} n, p = size(dtm) # TF tells us what proportion of a document is defined by a term @@ -32,7 +32,7 @@ function tf!{T1 <: Real, T2 <: AbstractFloat}(dtm::AbstractMatrix{T1}, tf::Abstr end # assumes second matrix has same nonzeros as first one -function tf!{T <: Real, F <: AbstractFloat}(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) +function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat} rows = rowvals(dtm) dtmvals = nonzeros(dtm) tfvals = nonzeros(tf) @@ -57,13 +57,13 @@ end # ############################################################################## -tf_idf{T <: Real}(dtm::Matrix{T}) = tf_idf!(dtm, Array{Float64}(size(dtm)...)) +tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(size(dtm)...)) -tf_idf{T <: Real}(dtm::SparseMatrixCSC{T}) = tf_idf!(dtm, similar(dtm, Float64)) +tf_idf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, similar(dtm, Float64)) -tf_idf!{T <: Real}(dtm::AbstractMatrix{T}) = tf_idf!(dtm, dtm) +tf_idf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf_idf!(dtm, dtm) -tf_idf!{T <: Real}(dtm::SparseMatrixCSC{T}) = tf_idf!(dtm, dtm) +tf_idf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, dtm) tf_idf(dtm::DocumentTermMatrix) = tf_idf(dtm.dtm) @@ -73,7 +73,7 @@ tf_idf(dtm::DocumentTermMatrix) = tf_idf(dtm.dtm) # The second Matrix will be overwritten with the result # Will work correctly if dtm and tfidf are the same matrix -function tf_idf!{T1 <: Real, T2 <: AbstractFloat}(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) +function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat} n, p = size(dtm) # TF tells us what proportion of a document is defined by a term @@ -94,7 +94,7 @@ function tf_idf!{T1 <: Real, T2 <: AbstractFloat}(dtm::AbstractMatrix{T1}, tfidf end # sparse version -function tf_idf!{T <: Real, F <: AbstractFloat}(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) +function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat} rows = rowvals(dtm) dtmvals = nonzeros(dtm) tfidfvals = nonzeros(tfidf) @@ -104,7 +104,7 @@ function tf_idf!{T <: Real, F <: AbstractFloat}(dtm::SparseMatrixCSC{T}, tfidf:: # TF tells us what proportion of a document is defined by a term words_in_documents = F.(sum(dtm,2)) - const oneval = one(F) + oneval = one(F) # IDF tells us how rare a term is in the corpus documents_containing_term = vec(sum(dtm .> 0, 1)) diff --git a/src/tokenizer.jl b/src/tokenizer.jl index 65eee345..044011f2 100644 --- a/src/tokenizer.jl +++ b/src/tokenizer.jl @@ -4,6 +4,6 @@ # ############################################################################## -tokenize{S <: Language, T <: AbstractString}(lang::S, s::T) = WordTokenizers.tokenize(s) +tokenize(lang::S, s::T) where {S <: Language, T <: AbstractString} = WordTokenizers.tokenize(s) -sentence_tokenize{S <: Language, T<:AbstractString}(lang::S, s::T) = WordTokenizers.split_sentences(s) +sentence_tokenize(lang::S, s::T) where {S <: Language, T<:AbstractString} = WordTokenizers.split_sentences(s) diff --git a/test/runtests.jl b/test/runtests.jl index a9c8ae54..04f704c8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,5 @@ module TestTextAnalysis -using Base.Test +using Test using Languages using TextAnalysis using Compat From 0485ff32b01bb4a5967daacfa331570e3b8b341e Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Wed, 5 Sep 2018 21:32:00 +0200 Subject: [PATCH 3/6] Patch 3 (#99) * Change endof syntax * run femtocleaner * Change endof again --- src/preprocessing.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/preprocessing.jl b/src/preprocessing.jl index 33d2e411..6f56f5cf 100644 --- a/src/preprocessing.jl +++ b/src/preprocessing.jl @@ -258,7 +258,7 @@ function remove_patterns(s::AbstractString, rex::Regex) Base.write_sub(iob, v, ibegin, len) write(iob, ' ') end - ibegin = nextind(s, m.endof+m.offset) + ibegin = nextind(s, endof(m)+m.offset) end len = length(v) - ibegin + 1 (len > 0) && Base.write_sub(iob, v, ibegin, len) @@ -276,9 +276,9 @@ function remove_patterns(s::SubString{T}, rex::Regex) where T <: String Base.write_sub(iob, data, ibegin+ioffset, len) write(iob, ' ') end - ibegin = nextind(s, m.endof+m.offset) + ibegin = nextind(s, endof(m)+m.offset) end - len = s.endof - ibegin + 1 + len = endof(s) - ibegin + 1 (len > 0) && Base.write_sub(iob, data, ibegin+ioffset, len) String(take!(iob)) end From 488145531f481cb2068b925078c7de473de450a2 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Wed, 5 Sep 2018 23:37:10 +0100 Subject: [PATCH 4/6] More fixes for 0.7 --- src/TextAnalysis.jl | 4 ++++ src/dtm.jl | 2 +- src/lda.jl | 8 ++++---- src/preprocessing.jl | 2 +- src/sentiment.jl | 2 +- src/stemmer.jl | 4 ++-- src/summarizer.jl | 4 ++-- src/tf_idf.jl | 12 ++++++------ test/lda.jl | 4 ++-- test/runtests.jl | 1 + 10 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 90ceb988..08127aa3 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -1,6 +1,10 @@ using DataFrames module TextAnalysis + using SparseArrays + using Printf + using LinearAlgebra + using Languages using DataFrames using WordTokenizers diff --git a/src/dtm.jl b/src/dtm.jl index 2a9a5adc..a64738ad 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -71,7 +71,7 @@ function dtm(d::DocumentTermMatrix, density::Symbol) if density == :sparse return d.dtm else - return full(d.dtm) + return Matrix(d.dtm) end end diff --git a/src/lda.jl b/src/lda.jl index d746763d..d1bc099a 100644 --- a/src/lda.jl +++ b/src/lda.jl @@ -37,8 +37,8 @@ Perform [Latent Dirichlet allocation](https://en.wikipedia.org/wiki/Latent_Diric function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, alpha::Float64, beta::Float64) number_of_documents, number_of_words = size(dtm.dtm) - docs = Vector{Lda.TopicBasedDocument}(number_of_documents) - topics = Vector{Lda.Topic}(ntopics) + docs = Vector{Lda.TopicBasedDocument}(undef, number_of_documents) + topics = Vector{Lda.Topic}(undef, ntopics) for i in 1:ntopics topics[i] = Lda.Topic() end @@ -59,7 +59,7 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, alpha::Float end docs[i] = topic_base_document end - probs = Vector{Float64}(ntopics) + probs = Vector{Float64}(undef, ntopics) # Gibbs sampling for _ in 1:iteration for doc in docs @@ -101,7 +101,7 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, alpha::Float ϕ = spzeros(ntopics, number_of_words) θ = getfield.(docs, :topicidcount) θ = Float64.(hcat(θ...)) - θ ./= sum(θ, 1) + θ ./= sum(θ, dims=1) for topic in 1:ntopics t = topics[topic] for (word, count) in t.wordcount diff --git a/src/preprocessing.jl b/src/preprocessing.jl index 6f56f5cf..27a2d2c1 100644 --- a/src/preprocessing.jl +++ b/src/preprocessing.jl @@ -43,7 +43,7 @@ end # ############################################################################## function remove_corrupt_utf8(s::AbstractString) - return map(x->UInt(x)!=0xfffd ? x : ' ' , s) + return map(x->isvalid(x) ? x : ' ', s) end remove_corrupt_utf8!(d::FileDocument) = error("FileDocument cannot be modified") diff --git a/src/sentiment.jl b/src/sentiment.jl index 7d9f145c..56bc0aa7 100644 --- a/src/sentiment.jl +++ b/src/sentiment.jl @@ -30,7 +30,7 @@ end function flatten(x) l = prod(size(x)) - x = permutedims(x, reverse(range(1, ndims(x)))) + x = permutedims(x, reverse(range(1, length=ndims(x)))) return reshape(x, (l, 1)) end diff --git a/src/stemmer.jl b/src/stemmer.jl index 24bba56e..a1cc14fc 100644 --- a/src/stemmer.jl +++ b/src/stemmer.jl @@ -45,7 +45,7 @@ mutable struct Stemmer end stm = new(cptr, stemmer_type, charenc) - finalizer(stm, release) + finalizer(release, stm) stm end end @@ -79,7 +79,7 @@ end function stem(stemmer::Stemmer, words::Array) l::Int = length(words) - ret = Array{String}(l) + ret = Array{String}(undef, l) for idx in 1:l ret[idx] = stem(stemmer, words[idx]) end diff --git a/src/summarizer.jl b/src/summarizer.jl index 46a040ab..1179bafc 100644 --- a/src/summarizer.jl +++ b/src/summarizer.jl @@ -20,8 +20,8 @@ function pagerank( A; Niter=20, damping=.15) for i=1:Niter s = r * A - scale!(s, damping) - r = s .+ (a * sum(r,2)); # Compute PageRank. + rmul!(s, damping) + r = s .+ (a * sum(r, dims=2)); # Compute PageRank. end r = r./norm(r,1); diff --git a/src/tf_idf.jl b/src/tf_idf.jl index d1db3216..a7f142c3 100644 --- a/src/tf_idf.jl +++ b/src/tf_idf.jl @@ -4,7 +4,7 @@ # ############################################################################## -tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(size(dtm)...)) +tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(undef, size(dtm)...)) tf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, similar(dtm, Float64)) @@ -39,7 +39,7 @@ function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T <: Real, @assert size(dtmvals) == size(tfvals) # TF tells us what proportion of a document is defined by a term - words_in_documents = sum(dtm,2) + words_in_documents = sum(dtm,dims=2) n, p = size(dtm) for i = 1:p @@ -57,7 +57,7 @@ end # ############################################################################## -tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(size(dtm)...)) +tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(undef, size(dtm)...)) tf_idf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, similar(dtm, Float64)) @@ -80,7 +80,7 @@ function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 < tf!(dtm, tfidf) # IDF tells us how rare a term is in the corpus - documents_containing_term = vec(sum(dtm .> 0, 1)) + documents_containing_term = vec(sum(dtm .> 0, dims=1)) idf = log.(n ./ documents_containing_term) # TF-IDF is the product of TF and IDF @@ -103,11 +103,11 @@ function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <: n, p = size(dtm) # TF tells us what proportion of a document is defined by a term - words_in_documents = F.(sum(dtm,2)) + words_in_documents = F.(sum(dtm, dims=2)) oneval = one(F) # IDF tells us how rare a term is in the corpus - documents_containing_term = vec(sum(dtm .> 0, 1)) + documents_containing_term = vec(sum(dtm .> 0, dims=1)) idf = log.(n ./ documents_containing_term) for i = 1:p diff --git a/test/lda.jl b/test/lda.jl index bd63c2d7..7dbe560b 100644 --- a/test/lda.jl +++ b/test/lda.jl @@ -1,6 +1,6 @@ @testset "LDA" begin - + doc1 = "a a a sample text text" doc2 = "another example example text text" @@ -12,5 +12,5 @@ ϕ, θ = lda(dtm, 2, 25, 0.1, 0.1) @test ϕ isa SparseMatrixCSC @test θ isa Matrix{Float64} - @test all(sum(θ,1) .≈ 1) + @test all(sum(θ, dims=1) .≈ 1) end diff --git a/test/runtests.jl b/test/runtests.jl index 04f704c8..36b95ae8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ module TestTextAnalysis +using SparseArrays using Test using Languages using TextAnalysis From 53ebfdf0196e7c4f0c822117c32229081a8c1c5f Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Sat, 29 Sep 2018 11:38:10 +0200 Subject: [PATCH 5/6] New iteration interface for Corpus --- src/corpus.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/corpus.jl b/src/corpus.jl index e5083b92..c0a57514 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -102,9 +102,10 @@ end # ############################################################################## -Base.start(crps::Corpus) = 1 -Base.next(crps::Corpus, ind::Int) = (crps.documents[ind], ind + 1) -Base.done(crps::Corpus, ind::Int) = ind > length(crps.documents) +function Base.iterate(crps::Corpus, ind=1) + ind > length(crps.documents) && return nothing + crps.documents[ind], ind+1 +end ############################################################################## # From dbf5bedbfb0deded552d2fcf764655ede0792da1 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Mon, 1 Oct 2018 11:02:40 +0100 Subject: [PATCH 6/6] More 1.0 fixes --- src/corpus.jl | 4 ++-- src/metadata.jl | 14 +++++++------- src/preprocessing.jl | 18 +++++++++--------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/corpus.jl b/src/corpus.jl index c0a57514..bd94f410 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -116,8 +116,8 @@ end Base.push!(crps::Corpus, d::AbstractDocument) = push!(crps.documents, d) Base.pop!(crps::Corpus) = pop!(crps.documents) -Base.unshift!(crps::Corpus, d::AbstractDocument) = pushfirst!(crps.documents, d) -Base.shift!(crps::Corpus) = popfirst!(crps.documents) +Base.pushfirst!(crps::Corpus, d::AbstractDocument) = pushfirst!(crps.documents, d) +Base.popfirst!(crps::Corpus) = popfirst!(crps.documents) function Base.insert!(crps::Corpus, index::Int, d::AbstractDocument) insert!(crps.documents, index, d) diff --git a/src/metadata.jl b/src/metadata.jl index 6e819481..5ba94a4a 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -39,34 +39,34 @@ authors(c::Corpus) = map(d -> author(d), documents(c)) timestamps(c::Corpus) = map(d -> timestamp(d), documents(c)) names!(c::Corpus, nv::AbstractString) = name!.(documents(c), nv) -languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), nv) -authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), nv) -timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), nv) +languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), Ref(nv)) #Ref to force scalar broadcast +authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), Ref(nv)) +timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), Ref(nv)) function names!(c::Corpus, nvs::Vector{String}) length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match")) - for (i, d) in enumerate(IndexLinear(), documents(c)) + for (i, d) in pairs(IndexLinear(), documents(c)) name!(d, nvs[i]) end end function languages!(c::Corpus, nvs::Vector{T}) where T <: Language length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match")) - for (i, d) in enumerate(IndexLinear(), documents(c)) + for (i, d) in pairs(IndexLinear(), documents(c)) language!(d, nvs[i]) end end function authors!(c::Corpus, nvs::Vector{String}) length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match")) - for (i, d) in enumerate(IndexLinear(), documents(c)) + for (i, d) in pairs(IndexLinear(), documents(c)) author!(d, nvs[i]) end end function timestamps!(c::Corpus, nvs::Vector{String}) length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match")) - for (i, d) in enumerate(IndexLinear(), documents(c)) + for (i, d) in pairs(IndexLinear(), documents(c)) timestamp!(d, nvs[i]) end end diff --git a/src/preprocessing.jl b/src/preprocessing.jl index 27a2d2c1..e77736a1 100644 --- a/src/preprocessing.jl +++ b/src/preprocessing.jl @@ -251,14 +251,14 @@ end function remove_patterns(s::AbstractString, rex::Regex) iob = IOBuffer() ibegin = 1 - v=Vector{UInt8}(s) - for m in matchall(rex, s) - len = m.offset-ibegin+1 + v=codeunits(s) + for m in eachmatch(rex, s) + len = m.match.offset-ibegin+1 if len > 0 Base.write_sub(iob, v, ibegin, len) write(iob, ' ') end - ibegin = nextind(s, endof(m)+m.offset) + ibegin = nextind(s, lastindex(m.match)+m.match.offset) end len = length(v) - ibegin + 1 (len > 0) && Base.write_sub(iob, v, ibegin, len) @@ -268,17 +268,17 @@ end function remove_patterns(s::SubString{T}, rex::Regex) where T <: String iob = IOBuffer() ioffset = s.offset - data = Vector{UInt8}(s.string) + data = codeunits(s.string) ibegin = 1 - for m in matchall(rex, s) - len = m.offset-ibegin+1 + for m in eachmatch(rex, s) + len = m.match.offset-ibegin+1 if len > 0 Base.write_sub(iob, data, ibegin+ioffset, len) write(iob, ' ') end - ibegin = nextind(s, endof(m)+m.offset) + ibegin = nextind(s, lastindex(m.match)+m.match.offset) end - len = endof(s) - ibegin + 1 + len = lastindex(s) - ibegin + 1 (len > 0) && Base.write_sub(iob, data, ibegin+ioffset, len) String(take!(iob)) end