From c478c5ead144c1dbbec3042f1b6f29796f8a883a Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 12 Jun 2019 10:14:05 +0530 Subject: [PATCH 01/14] Add PreprocessBuffer --- src/TextAnalysis.jl | 4 ++++ src/fastpreprocess.jl | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/fastpreprocess.jl diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index c2f88311..ff9a7a71 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -60,6 +60,9 @@ module TextAnalysis include("corpus.jl") include("metadata.jl") include("preprocessing.jl") + + include("fastpreprocess.jl") + # Load libstemmer from our deps.jl const depsjl_path = joinpath(dirname(@__FILE__), "..", "deps", "deps.jl") if !isfile(depsjl_path) @@ -79,4 +82,5 @@ module TextAnalysis include("deprecations.jl") include("utils.jl") include("rouge.jl") + end diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl new file mode 100644 index 00000000..020f113a --- /dev/null +++ b/src/fastpreprocess.jl @@ -0,0 +1,34 @@ +# TODO +# * strip_sparse_terms +# * strip_frequent_terms +# * strip_html_tags +# * strip_non_letters +""" +Preprocessing functions + +* strip_case + +* corrupt_utf8 +* whitespace +* punctuation +* numbers +* indefinite_articles +* definite_articles +* articles +* stopwords +* prepositions +* pronouns +""" +mutable struct PreprocessBuffer + input::Vector{Char} + buffer::Vector{Char} + idx::Int +end + +PreprocessBuffer(input) = PreprocessBuffer(input, [], 1) + +PreprocessBuffer(input::AbstractString) = PreprocessBuffer(collect(input)) + +Base.getindex(ps::PreprocessBuffer, i = ps.idx) = ps.input[i] + +isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input) From 08c41c58a23b2058ad2ba5cd158a4b94c206d671 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 12 Jun 2019 10:15:17 +0530 Subject: [PATCH 02/14] Lexers for preprocessbuffer --- src/fastpreprocess.jl | 93 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 020f113a..272263bf 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -32,3 +32,96 @@ PreprocessBuffer(input::AbstractString) = PreprocessBuffer(collect(input)) Base.getindex(ps::PreprocessBuffer, i = ps.idx) = ps.input[i] isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input) + +""" + corrupt_utf8(ps::PreprocessBuffer) + +Removes the corrupt UTF8 chars. +""" +function corrupt_utf8(ps) + return false +end + +""" + whitespace(ps::PreprocessBuffer) + +Squash multiple whitespaces to a single one. +And remove all leading and trailing whitespaces. +""" +function whitespace(ps) + return false +end + +""" + punctuation(ps::PreprocessBuffer) + +Squash multiple whitespaces to a single one. +And remove all leading and trailing whitespaces. + +""" +function punctuation(ps) + return false +end + +""" + numbers(::PreprocessBuffer) + +Removes all numbers. + +""" +function numbers(ps) + return false +end + +""" + lookahead(::PreprocessBuffer, s; boundary = false) + +Peek at the input to see if `s` is coming up next. `boundary` specifies whether +a word boundary should follow `s`. + +``` +julia> lookahead(PreprocessBuffer("foo bar"), "foo") +true +julia> lookahead(PreprocessBuffer("foo bar"), "bar") +false +julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true) +true +julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true) +false +``` +""" +function lookahead(ps::PreprocessBuffer, s; boundary = false) + ps.idx + length(s) - 1 > length(ps.input) && return false + + for j = 1:length(s) + ps.input[ps.idx - 1 + j] == s[j] || return false + end + if boundary + next = ps.idx + length(s) + next > length(ps.input) && return true + (isletter(ps[next]) || ps[next] == '-') && return false + end + return true +end + +""" +Matches true for characters corresponding to Regex("[a-zA-Z0-9_]") +""" +word_character(ch) = isascii(ch) && (isuppercase(ch) || islowercase(ch) || + isdigit(ch) || ch == '_') + + +""" + words_remove(::PreprocessBuffer, ws) + +Removes ws from the PreprocessBuffer. +""" +function words_remove(ps, ws) + ps.idx != 1 && word_character(ps[ps.idx - 1]) && return false + for s in ws + lookahead(ps, s, boundary=true) || continue + ps.idx += length(s) + return true + end + return false +end From 84388ec77d6853e51543d01428ddb2d17549dbb6 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 14 Jun 2019 10:16:05 +0530 Subject: [PATCH 03/14] Preprocess function --- src/fastpreprocess.jl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 272263bf..75a1d170 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -125,3 +125,38 @@ function words_remove(ps, ws) end return false end + +function try_fast(text, lang) + length(text) < 1 && return + + indef_a = indefinite_articles(lang) + def_a = definite_articles(lang) + stop = stopwords(lang) + prepo = prepositions(lang) + pron = pronouns(lang) + + ws = vcat(indef_a, def_a, stop, prepo, pron) + + ps = PreprocessBuffer(text) + + # TODO: Check case insensitive in words + + while !isdone(ps) + (corrupt_utf8(ps) || + whitespace(ps) || + punctuation(ps) || + numbers(ps) || + words_remove(ps, ws)) && continue + + push!(ps.buffer, ps[]) + ps.idx += 1 + end + + return String(ps.buffer) +end + +function try_fast(doc::StringDocument) + doc.text = try_fast(doc.text, Languages.English()) +end + +# HTML placed before words From 4f90d7b664ef1eb060259cd7e15ecca90cc0230a Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 16 Jun 2019 13:22:59 +0530 Subject: [PATCH 04/14] Speed up PreprocessBuffer 10x --- src/TextAnalysis.jl | 1 + src/fastpreprocess.jl | 67 ++++++++++++++++++++++++++----------------- 2 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index ff9a7a71..2c3a22b3 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -6,6 +6,7 @@ module TextAnalysis using Languages using DataFrames using WordTokenizers + using DataStructures import DataFrames.DataFrame import Base.depwarn diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 75a1d170..0794a968 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -42,22 +42,11 @@ function corrupt_utf8(ps) return false end -""" - whitespace(ps::PreprocessBuffer) - -Squash multiple whitespaces to a single one. -And remove all leading and trailing whitespaces. -""" -function whitespace(ps) - return false -end - """ punctuation(ps::PreprocessBuffer) Squash multiple whitespaces to a single one. And remove all leading and trailing whitespaces. - """ function punctuation(ps) return false @@ -67,7 +56,6 @@ end numbers(::PreprocessBuffer) Removes all numbers. - """ function numbers(ps) return false @@ -104,26 +92,49 @@ function lookahead(ps::PreprocessBuffer, s; boundary = false) return true end +""" +Helper function for words_remove. +""" +function next_token(ps::PreprocessBuffer) + i = ps.idx + while i < length(ps.input) && isletter(ps[i]) + i += 1 + end + + return String(ps.input[ps.idx:i-1]) +end + """ Matches true for characters corresponding to Regex("[a-zA-Z0-9_]") """ word_character(ch) = isascii(ch) && (isuppercase(ch) || islowercase(ch) || isdigit(ch) || ch == '_') - """ words_remove(::PreprocessBuffer, ws) -Removes ws from the PreprocessBuffer. +Removes words from the PreprocessBuffer. """ function words_remove(ps, ws) ps.idx != 1 && word_character(ps[ps.idx - 1]) && return false - for s in ws - lookahead(ps, s, boundary=true) || continue - ps.idx += length(s) - return true + isletter(ps[ps.idx]) || return false + + token = next_token(ps) + # println(token) + + if token ∉ ws + append!(ps.buffer, ps.input[ps.idx:ps.idx + length(token) ]) + ps.idx = ps.idx + length(token) + 1 + else + ps.idx += length(token) end - return false + + return true +end + +function next(ps::PreprocessBuffer) + push!(ps.buffer, ps[]) + ps.idx += 1 end function try_fast(text, lang) @@ -135,21 +146,16 @@ function try_fast(text, lang) prepo = prepositions(lang) pron = pronouns(lang) - ws = vcat(indef_a, def_a, stop, prepo, pron) - + ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron)) ps = PreprocessBuffer(text) # TODO: Check case insensitive in words while !isdone(ps) - (corrupt_utf8(ps) || - whitespace(ps) || + corrupt_utf8(ps) || punctuation(ps) || numbers(ps) || - words_remove(ps, ws)) && continue - - push!(ps.buffer, ps[]) - ps.idx += 1 + words_remove(ps, ws) || next(ps) end return String(ps.buffer) @@ -159,4 +165,11 @@ function try_fast(doc::StringDocument) doc.text = try_fast(doc.text, Languages.English()) end +# Only for String Document +function fastpreprocess(crps::Corpus, flags) + + # strip + +end + # HTML placed before words From 1d7cb5dcff9461d3b997c1c2c336970c7647c446 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 16 Jun 2019 14:08:29 +0530 Subject: [PATCH 05/14] Change to fastpreprocess --- src/fastpreprocess.jl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 0794a968..38a0735b 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -137,7 +137,8 @@ function next(ps::PreprocessBuffer) ps.idx += 1 end -function try_fast(text, lang) +# ws of type Sorted Set +function fastpreprocess(text::String, lang) length(text) < 1 && return indef_a = indefinite_articles(lang) @@ -161,15 +162,11 @@ function try_fast(text, lang) return String(ps.buffer) end -function try_fast(doc::StringDocument) +function fastpreprocess(doc::StringDocument) doc.text = try_fast(doc.text, Languages.English()) end # Only for String Document function fastpreprocess(crps::Corpus, flags) - - # strip - end - # HTML placed before words From 6284fb1c73fc783127a9fd26591c9bd076b80faf Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 02:42:50 +0530 Subject: [PATCH 06/14] Remove Buffer, speed up 8x times --- src/fastpreprocess.jl | 86 ++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 38a0735b..32f71e43 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -21,11 +21,11 @@ Preprocessing functions """ mutable struct PreprocessBuffer input::Vector{Char} - buffer::Vector{Char} + # buffer::Vector{Char} idx::Int end -PreprocessBuffer(input) = PreprocessBuffer(input, [], 1) +PreprocessBuffer(input) = PreprocessBuffer(input, 1) PreprocessBuffer(input::AbstractString) = PreprocessBuffer(collect(input)) @@ -61,47 +61,48 @@ function numbers(ps) return false end -""" - lookahead(::PreprocessBuffer, s; boundary = false) - -Peek at the input to see if `s` is coming up next. `boundary` specifies whether -a word boundary should follow `s`. - -``` -julia> lookahead(PreprocessBuffer("foo bar"), "foo") -true -julia> lookahead(PreprocessBuffer("foo bar"), "bar") -false -julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true) -true -julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true) -false -``` -""" -function lookahead(ps::PreprocessBuffer, s; boundary = false) - ps.idx + length(s) - 1 > length(ps.input) && return false - - for j = 1:length(s) - ps.input[ps.idx - 1 + j] == s[j] || return false - end - if boundary - next = ps.idx + length(s) - next > length(ps.input) && return true - (isletter(ps[next]) || ps[next] == '-') && return false - end - return true -end +# """ +# lookahead(::PreprocessBuffer, s; boundary = false) +# +# Peek at the input to see if `s` is coming up next. `boundary` specifies whether +# a word boundary should follow `s`. +# +# ``` +# julia> lookahead(PreprocessBuffer("foo bar"), "foo") +# true +# julia> lookahead(PreprocessBuffer("foo bar"), "bar") +# false +# julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true) +# true +# julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true) +# false +# ``` +# """ +# function lookahead(ps::PreprocessBuffer, s; boundary = false) +# ps.idx + length(s) - 1 > length(ps.input) && return false +# +# for j = 1:length(s) +# ps.input[ps.idx - 1 + j] == s[j] || return false +# end +# if boundary +# next = ps.idx + length(s) +# next > length(ps.input) && return true +# (isletter(ps[next]) || ps[next] == '-') && return false +# end +# return true +# end """ Helper function for words_remove. """ -function next_token(ps::PreprocessBuffer) +function next_token(ps::PreprocessBuffer, ws) i = ps.idx while i < length(ps.input) && isletter(ps[i]) i += 1 end - return String(ps.input[ps.idx:i-1]) + String(ps.input[ps.idx:i-1]) ∈ ws && return true, i + return false, i end """ @@ -119,21 +120,21 @@ function words_remove(ps, ws) ps.idx != 1 && word_character(ps[ps.idx - 1]) && return false isletter(ps[ps.idx]) || return false - token = next_token(ps) + match, i = next_token(ps, ws) # println(token) - if token ∉ ws - append!(ps.buffer, ps.input[ps.idx:ps.idx + length(token) ]) - ps.idx = ps.idx + length(token) + 1 + if match == false + ps.idx = ps.idx + i else - ps.idx += length(token) + deleteat!(ps.input, ps.idx:i - 1) + ps.idx += 1 end return true end function next(ps::PreprocessBuffer) - push!(ps.buffer, ps[]) + # push!(ps.buffer, ps[]) ps.idx += 1 end @@ -159,11 +160,12 @@ function fastpreprocess(text::String, lang) words_remove(ps, ws) || next(ps) end - return String(ps.buffer) + return String(ps.input) end function fastpreprocess(doc::StringDocument) - doc.text = try_fast(doc.text, Languages.English()) + doc.text = fastpreprocess(doc.text, Languages.English()) + println() end # Only for String Document From 5157c0d2cc8fecf6b8ae52548f32f3153dae51d5 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 14:02:19 +0530 Subject: [PATCH 07/14] Fix minor bugz --- src/fastpreprocess.jl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 32f71e43..4c4d59ba 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -33,6 +33,7 @@ Base.getindex(ps::PreprocessBuffer, i = ps.idx) = ps.input[i] isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input) +# TODO: Remove whitespace at the end, beginning and multiple whitepsaces into one. """ corrupt_utf8(ps::PreprocessBuffer) @@ -97,7 +98,7 @@ Helper function for words_remove. """ function next_token(ps::PreprocessBuffer, ws) i = ps.idx - while i < length(ps.input) && isletter(ps[i]) + while i <= length(ps.input) && isletter(ps[i]) i += 1 end @@ -121,11 +122,14 @@ function words_remove(ps, ws) isletter(ps[ps.idx]) || return false match, i = next_token(ps, ws) - # println(token) if match == false ps.idx = ps.idx + i else + if ps.idx > 1 && isspace(ps[ps.idx - 1]) + ps.idx -= 1 + end + deleteat!(ps.input, ps.idx:i - 1) ps.idx += 1 end @@ -134,13 +138,12 @@ function words_remove(ps, ws) end function next(ps::PreprocessBuffer) - # push!(ps.buffer, ps[]) ps.idx += 1 end # ws of type Sorted Set -function fastpreprocess(text::String, lang) - length(text) < 1 && return +function fastpreprocess(txt::String, lang) + length(txt) < 1 && return indef_a = indefinite_articles(lang) def_a = definite_articles(lang) @@ -149,7 +152,7 @@ function fastpreprocess(text::String, lang) pron = pronouns(lang) ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron)) - ps = PreprocessBuffer(text) + ps = PreprocessBuffer(txt) # TODO: Check case insensitive in words @@ -165,7 +168,7 @@ end function fastpreprocess(doc::StringDocument) doc.text = fastpreprocess(doc.text, Languages.English()) - println() + nothing end # Only for String Document From ee43f17e3a9415f12bca41d6351e16bb16923447 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 14:10:48 +0530 Subject: [PATCH 08/14] Corrupt utf8 --- src/fastpreprocess.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 4c4d59ba..ea3207e4 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -40,7 +40,10 @@ isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input) Removes the corrupt UTF8 chars. """ function corrupt_utf8(ps) - return false + isvalid(ps[ps.idx]) && return false + + deleteat!(ps, ps.idx) + return true end """ From ce66660cc6fe92bee19b2c4442d24cf6cc13eab9 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 19:51:08 +0530 Subject: [PATCH 09/14] Add functions for whitespaces, numbers, punct --- src/fastpreprocess.jl | 52 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index ea3207e4..64a3acfd 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -47,13 +47,49 @@ function corrupt_utf8(ps) end """ - punctuation(ps::PreprocessBuffer) + whitespace(ps::PreprocessBuffer) Squash multiple whitespaces to a single one. And remove all leading and trailing whitespaces. """ +function whitespace(ps) + isspace(ps) || return false + + ps.idx != 1 && !isspace(ps[ps.idx - 1]) && return false + + deleteat!(ps, ps.idx) + return true + + # If prev is whitespace then delete. +end + +function trailing_whitespace(ps) + isspace(ps[i]) || return + + i = length(ps.input) + + while (i > 0) && isspace(ps[i]) + i -= 1 + end + + deleteat!(ps, i + 1: length(ps.input)) +end + +""" + punctuation(ps::PreprocessBuffer) + +Remove punctuations. +""" function punctuation(ps) - return false + ispunct(ps[]) || return false + + if ps.idx > 1 && isspace(ps[ps.idx - 1]) + deleteat!(ps, ps.idx - 1:ps.idx) + else + deleteat!(ps, ps.idx) + end + + return true end """ @@ -62,7 +98,15 @@ end Removes all numbers. """ function numbers(ps) - return false + isdigit(ps[]) || return false + + if ps.idx > 1 && isspace(ps[ps.idx - 1]) + deleteat!(ps, ps.idx - 1:ps.idx) + else + deleteat!(ps, ps.idx) + end + + return true end # """ @@ -166,6 +210,8 @@ function fastpreprocess(txt::String, lang) words_remove(ps, ws) || next(ps) end + trailing_whitespace(ps) + return String(ps.input) end From 4ec1f714b55c4d136c5fd5ca764989964dbc0f48 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 20:32:48 +0530 Subject: [PATCH 10/14] Minor bug fixes --- src/fastpreprocess.jl | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 64a3acfd..b95f37b9 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -53,26 +53,25 @@ Squash multiple whitespaces to a single one. And remove all leading and trailing whitespaces. """ function whitespace(ps) - isspace(ps) || return false + isspace(ps[ps.idx]) || return false - ps.idx != 1 && !isspace(ps[ps.idx - 1]) && return false + ps.idx != 1 && !isspace(ps[ps.idx - 1]) && return next(ps) - deleteat!(ps, ps.idx) + deleteat!(ps.input, ps.idx) return true # If prev is whitespace then delete. end function trailing_whitespace(ps) - isspace(ps[i]) || return - - i = length(ps.input) + isspace(ps[length(ps.input)]) || return + i = length(ps.input) - 1 while (i > 0) && isspace(ps[i]) i -= 1 end - deleteat!(ps, i + 1: length(ps.input)) + deleteat!(ps.input, i + 1: length(ps.input)) end """ @@ -83,12 +82,7 @@ Remove punctuations. function punctuation(ps) ispunct(ps[]) || return false - if ps.idx > 1 && isspace(ps[ps.idx - 1]) - deleteat!(ps, ps.idx - 1:ps.idx) - else - deleteat!(ps, ps.idx) - end - + deleteat!(ps.input, ps.idx) return true end @@ -100,12 +94,7 @@ Removes all numbers. function numbers(ps) isdigit(ps[]) || return false - if ps.idx > 1 && isspace(ps[ps.idx - 1]) - deleteat!(ps, ps.idx - 1:ps.idx) - else - deleteat!(ps, ps.idx) - end - + deleteat!(ps.input, ps.idx) return true end @@ -148,6 +137,7 @@ function next_token(ps::PreprocessBuffer, ws) while i <= length(ps.input) && isletter(ps[i]) i += 1 end + i < length(ps.input) && isdigit(ps[i]) && return false, i String(ps.input[ps.idx:i-1]) ∈ ws && return true, i return false, i @@ -171,14 +161,9 @@ function words_remove(ps, ws) match, i = next_token(ps, ws) if match == false - ps.idx = ps.idx + i + ps.idx = i else - if ps.idx > 1 && isspace(ps[ps.idx - 1]) - ps.idx -= 1 - end - deleteat!(ps.input, ps.idx:i - 1) - ps.idx += 1 end return true @@ -186,6 +171,7 @@ end function next(ps::PreprocessBuffer) ps.idx += 1 + return true end # ws of type Sorted Set @@ -204,13 +190,14 @@ function fastpreprocess(txt::String, lang) # TODO: Check case insensitive in words while !isdone(ps) + whitespace(ps) || corrupt_utf8(ps) || punctuation(ps) || numbers(ps) || words_remove(ps, ws) || next(ps) end - trailing_whitespace(ps) + # trailing_whitespace(ps) return String(ps.input) end From 851e746c1d522b63f6b45cdaf55ee29d8771722e Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 21:06:11 +0530 Subject: [PATCH 11/14] Add docstrings for fastpreprocessing.jl --- src/fastpreprocess.jl | 95 +++++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index b95f37b9..23309e0f 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -1,13 +1,12 @@ -# TODO -# * strip_sparse_terms -# * strip_frequent_terms +# TODO Figure out the following: +# * strip_sparse_terms - to utilize `words_remove` and `sparse_terms` (of preprocessing.jl). +# * strip_frequent_terms - to utilize `words_remove` and `frequent_terms` (of preprocessing.jl). # * strip_html_tags # * strip_non_letters +# * strip_case """ Preprocessing functions -* strip_case - * corrupt_utf8 * whitespace * punctuation @@ -18,10 +17,28 @@ Preprocessing functions * stopwords * prepositions * pronouns + + +Turns a string into a readable and writable stream, +used for preprocessing and flushing out the processed text. + +Utility functions (lexers) such as `spaces` and `number` read characters from the stream +and match against it. + +Functions (lexers) return `true` or `false` to indicate whether they matched anything +in the input stream. They can therefore be combined easily, e.g. + + spacesornumber(ts) = whtiespace(ts) || numbers(ts) + +either deletes two consectutively read whitespaces or removes a number character, if matched. + +For certain cases like `strip_pronouns`, `strip_prepositions`, `strip_stopwords`, etc. +These are stored into a `SortedSet` for faster preprocessing and +matches words / tokens against the characters in the stream +in the function `words_remove`. """ mutable struct PreprocessBuffer input::Vector{Char} - # buffer::Vector{Char} idx::Int end @@ -63,6 +80,11 @@ function whitespace(ps) # If prev is whitespace then delete. end +""" + trailing_whitespace(ps::PreprocessBuffer) + +Remove the whitespaces at the end of the input stream. +""" function trailing_whitespace(ps) isspace(ps[length(ps.input)]) || return i = length(ps.input) - 1 @@ -77,7 +99,7 @@ end """ punctuation(ps::PreprocessBuffer) -Remove punctuations. +Remove punctuations, as matched by `ispunct`. """ function punctuation(ps) ispunct(ps[]) || return false @@ -98,39 +120,10 @@ function numbers(ps) return true end -# """ -# lookahead(::PreprocessBuffer, s; boundary = false) -# -# Peek at the input to see if `s` is coming up next. `boundary` specifies whether -# a word boundary should follow `s`. -# -# ``` -# julia> lookahead(PreprocessBuffer("foo bar"), "foo") -# true -# julia> lookahead(PreprocessBuffer("foo bar"), "bar") -# false -# julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true) -# true -# julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true) -# false -# ``` -# """ -# function lookahead(ps::PreprocessBuffer, s; boundary = false) -# ps.idx + length(s) - 1 > length(ps.input) && return false -# -# for j = 1:length(s) -# ps.input[ps.idx - 1 + j] == s[j] || return false -# end -# if boundary -# next = ps.idx + length(s) -# next > length(ps.input) && return true -# (isletter(ps[next]) || ps[next] == '-') && return false -# end -# return true -# end - """ Helper function for words_remove. +Matches the next token in the stream against the `ws::SortedSet`. +Returns whether it matched and the idx of the token end """ function next_token(ps::PreprocessBuffer, ws) i = ps.idx @@ -174,7 +167,22 @@ function next(ps::PreprocessBuffer) return true end -# ws of type Sorted Set +""" +Preprocessing functions + +* strip_case + +* corrupt_utf8 +* whitespace +* punctuation +* numbers +* indefinite_articles +* definite_articles +* articles +* stopwords +* prepositions +* pronouns +""" function fastpreprocess(txt::String, lang) length(txt) < 1 && return @@ -185,6 +193,13 @@ function fastpreprocess(txt::String, lang) pron = pronouns(lang) ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron)) + + return fastpreprocess(txt, ws) +end + +function fastpreprocess(txt::String, ws::SortedSet) + length(txt) < 1 && return + ps = PreprocessBuffer(txt) # TODO: Check case insensitive in words @@ -197,7 +212,7 @@ function fastpreprocess(txt::String, lang) words_remove(ps, ws) || next(ps) end - # trailing_whitespace(ps) + trailing_whitespace(ps) return String(ps.input) end From da4a64ca9acdac3a2c0969a696113b9573bc98c3 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 22:08:51 +0530 Subject: [PATCH 12/14] Add support for preprocessing over Corpus and Docs --- src/fastpreprocess.jl | 65 ++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 23309e0f..1540deec 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -19,8 +19,8 @@ Preprocessing functions * pronouns -Turns a string into a readable and writable stream, -used for preprocessing and flushing out the processed text. +Turns a string into a readable and writable stream of `Char`s, +used for preprocessing and flushing out the processes text. Utility functions (lexers) such as `spaces` and `number` read characters from the stream and match against it. @@ -168,33 +168,48 @@ function next(ps::PreprocessBuffer) end """ -Preprocessing functions + fastpreprocess(::StringDocument, flags) + fastpreprocess(::Corpus, flags) + fastpreprocess(::String, lang::T, flags) where T <: Language + fastpreprocess(::String, ::SortedSet, flags) -* strip_case +## Preprocessing functions currently available * corrupt_utf8 * whitespace * punctuation * numbers -* indefinite_articles -* definite_articles -* articles -* stopwords -* prepositions -* pronouns + +### Flags for functions requiring `words_remove` + +* strip_indefinite_articles +* strip_definite_articles +* strip_articles +* strip_stopwords +* strip_prepositions +* strip_pronouns + +## Usage + + +## Note: + +This does not work for Corpora consisting of `FileDocument`, +`TokenDocument` or `NGramDocument` + """ -function fastpreprocess(txt::String, lang) - length(txt) < 1 && return +fastpreprocess(txt::String, lang, flags) = fastpreprocess(txt, build_set(flags, lang)) - indef_a = indefinite_articles(lang) - def_a = definite_articles(lang) - stop = stopwords(lang) - prepo = prepositions(lang) - pron = pronouns(lang) +function build_set(flags, lang) + ws = SortedSet() - ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron)) + ((flags & strip_indefinite_articles) > 0) && union!(ws, indefinite_articles(lang)) + ((flags & strip_definite_articles) > 0) && union!(ws, definite_articles(lang)) - return fastpreprocess(txt, ws) + ((flags & strip_prepositions) > 0) && union!(ws, prepositions(lang)) + ((flags & strip_pronouns) > 0) && union!(ws, pronouns(lang)) + ((flags & strip_stopwords) > 0) && union!(ws, stopwords(lang)) + ws end function fastpreprocess(txt::String, ws::SortedSet) @@ -217,12 +232,18 @@ function fastpreprocess(txt::String, ws::SortedSet) return String(ps.input) end -function fastpreprocess(doc::StringDocument) - doc.text = fastpreprocess(doc.text, Languages.English()) - nothing +function fastpreprocess(doc::StringDocument, flags) + doc.text = fastpreprocess(doc.text, build_set(flags, lang(doc))) end # Only for String Document function fastpreprocess(crps::Corpus, flags) + ws = build_set(flags, lang(crps[1])) + + for doc in crps + doc.text = fastpreprocess(doc.text, ws))) + end + crps end + # HTML placed before words From 8bc5f933db5914f4e5cd76dce8ac9ffa66420d1b Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 22:39:42 +0530 Subject: [PATCH 13/14] Add tests for PreprocessBuffer. --- test/fastpreprocess.jl | 36 ++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 2 files changed, 37 insertions(+) create mode 100644 test/fastpreprocess.jl diff --git a/test/fastpreprocess.jl b/test/fastpreprocess.jl new file mode 100644 index 00000000..c174a661 --- /dev/null +++ b/test/fastpreprocess.jl @@ -0,0 +1,36 @@ +@testset "Preprocessing" begin + @testset "Words Removal" begin + doc = StringDocument("this is a the sample text") + fastpreprocess(doc, strip_articles) + @test isequal(doc.text, "this is sample text") + + doc = Document("this is the sample text") + fastpreprocess(doc, strip_definite_articles) + @test isequal(doc.text, "this is sample text") + + doc = Document("this is a sample text") + fastpreprocess(doc, strip_indefinite_articles) + @test isequal(doc.text, "this is sample text") + + doc = Document("this is on sample text") + fastpreprocess(doc, strip_prepositions) + @test isequal(doc.text, "this is sample text") + + doc = Document("this is my sample text") + fastpreprocess(doc, strip_pronouns) + @test isequal(doc.text, "this is sample text") + + doc = Document("this is sample text") + fastpreprocess(doc, strip_stopwords) + @test isequal(strip(doc.text), "sample text") + end + + # test Remove Corrupt UT8 + sd = StringDocument("abc") + fastpreprocess(sd) + @test sd.text == "abc" + + sd = StringDocument(String([0x43, 0xf0])) + fastpreprocess(sd) + @test sd.text == "C" +end diff --git a/test/runtests.jl b/test/runtests.jl index bffc62d2..c251bdbd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,6 +15,7 @@ include("document.jl") include("metadata.jl") include("corpus.jl") include("preprocessing.jl") +include("fastpreprocess.jl") include("dtm.jl") include("stemmer.jl") include("tf_idf.jl") From c81397281542a3bbbff8589d8a7b88801712d441 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 24 Jun 2019 22:40:18 +0530 Subject: [PATCH 14/14] Optional args in fastpreproces --- src/TextAnalysis.jl | 1 + src/fastpreprocess.jl | 21 +++++++++------------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 261d0ced..cdc761b3 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -46,6 +46,7 @@ module TextAnalysis export tf, tf_idf, lsa, lda, summarize export tf!, tf_idf!, lsa!, lda! export remove_patterns!, remove_patterns + export fastpreprocess, PreprocessBuffer export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl index 1540deec..faf97697 100644 --- a/src/fastpreprocess.jl +++ b/src/fastpreprocess.jl @@ -59,7 +59,7 @@ Removes the corrupt UTF8 chars. function corrupt_utf8(ps) isvalid(ps[ps.idx]) && return false - deleteat!(ps, ps.idx) + deleteat!(ps.input, ps.idx) return true end @@ -198,9 +198,9 @@ This does not work for Corpora consisting of `FileDocument`, `TokenDocument` or `NGramDocument` """ -fastpreprocess(txt::String, lang, flags) = fastpreprocess(txt, build_set(flags, lang)) +fastpreprocess(txt::String, lang = Languages.English(), flags = 0) = fastpreprocess(txt, build_set(flags, lang)) -function build_set(flags, lang) +function build_set(flags, lang = Languages.English()) ws = SortedSet() ((flags & strip_indefinite_articles) > 0) && union!(ws, indefinite_articles(lang)) @@ -212,13 +212,11 @@ function build_set(flags, lang) ws end +# TODO: Check case insensitive in words function fastpreprocess(txt::String, ws::SortedSet) length(txt) < 1 && return - ps = PreprocessBuffer(txt) - # TODO: Check case insensitive in words - while !isdone(ps) whitespace(ps) || corrupt_utf8(ps) || @@ -228,20 +226,19 @@ function fastpreprocess(txt::String, ws::SortedSet) end trailing_whitespace(ps) - return String(ps.input) end -function fastpreprocess(doc::StringDocument, flags) - doc.text = fastpreprocess(doc.text, build_set(flags, lang(doc))) +function fastpreprocess(doc::StringDocument, flags = 0) + doc.text = fastpreprocess(doc.text, build_set(flags, language(doc))) end # Only for String Document -function fastpreprocess(crps::Corpus, flags) - ws = build_set(flags, lang(crps[1])) +function fastpreprocess(crps::Corpus, flags = 0) + ws = build_set(flags, language(crps[1])) for doc in crps - doc.text = fastpreprocess(doc.text, ws))) + doc.text = fastpreprocess(doc.text, ws) end crps end