diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index e34285b0..cdc761b3 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -6,6 +6,7 @@ module TextAnalysis
     using Languages
     using DataFrames
     using WordTokenizers
+    using DataStructures
 
     import DataFrames.DataFrame
     import Base.depwarn
@@ -45,6 +46,7 @@ module TextAnalysis
     export tf, tf_idf, lsa, lda, summarize
     export tf!, tf_idf!, lsa!, lda!
     export remove_patterns!, remove_patterns
+    export fastpreprocess, PreprocessBuffer
 
     export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
     export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
@@ -61,6 +63,9 @@ module TextAnalysis
     include("corpus.jl")
     include("metadata.jl")
     include("preprocessing.jl")
+
+    include("fastpreprocess.jl")
+
     # Load libstemmer from our deps.jl
     const depsjl_path = joinpath(dirname(@__FILE__), "..", "deps", "deps.jl")
     if !isfile(depsjl_path)
diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
new file mode 100644
index 00000000..faf97697
--- /dev/null
+++ b/src/fastpreprocess.jl
@@ -0,0 +1,273 @@
+# TODO Figure out the following:
+# * strip_sparse_terms - to utilize `words_remove` and `sparse_terms` (of preprocessing.jl).
+# * strip_frequent_terms - to utilize `words_remove` and `frequent_terms` (of preprocessing.jl).
+# * strip_html_tags
+# * strip_non_letters
+# * strip_case
+"""
+Preprocessing functions
+
+* corrupt_utf8
+* whitespace
+* punctuation
+* numbers
+* indefinite_articles
+* definite_articles
+* articles
+* stopwords
+* prepositions
+* pronouns
+
+
+Turns a string into a readable and writable stream of `Char`s,
+used for preprocessing and flushing out the processed text.
+
+Utility functions (lexers) such as `whitespace` and `numbers` read characters from the stream
+and match against it.
+
+Functions (lexers) return `true` or `false` to indicate whether they matched anything
+in the input stream. They can therefore be combined easily, e.g.
+
+    spacesornumber(ts) = whitespace(ts) || numbers(ts)
+
+either squashes two consecutively read whitespaces or removes a number character, if matched.
+
+For certain cases like `strip_pronouns`, `strip_prepositions`, `strip_stopwords`, etc.,
+the words are stored in a `SortedSet` for faster preprocessing and
+matched against the characters in the stream
+in the function `words_remove`.
+"""
+mutable struct PreprocessBuffer
+    input::Vector{Char}
+    idx::Int
+end
+
+PreprocessBuffer(input) = PreprocessBuffer(input, 1)
+
+PreprocessBuffer(input::AbstractString) = PreprocessBuffer(collect(input))
+
+Base.getindex(ps::PreprocessBuffer, i = ps.idx) = ps.input[i]
+
+isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input)
+
+# TODO: Remove whitespace at the end, beginning and multiple whitespaces into one.
+"""
+    corrupt_utf8(ps::PreprocessBuffer)
+
+Delete the current character if it is not a valid `Char` (corrupt UTF-8).
+Return `true` if a character was deleted, `false` otherwise.
+"""
+function corrupt_utf8(ps)
+    isvalid(ps[ps.idx]) && return false
+
+    deleteat!(ps.input, ps.idx)
+    return true
+end
+
+"""
+    whitespace(ps::PreprocessBuffer)
+
+Squash multiple whitespaces to a single one and remove leading whitespace.
+Trailing whitespace is handled separately by `trailing_whitespace`.
+Return `true` if the current character is a whitespace, `false` otherwise.
+"""
+function whitespace(ps)
+    isspace(ps[ps.idx]) || return false
+
+    # First whitespace after a non-whitespace character: keep it and move on.
+    ps.idx != 1 && !isspace(ps[ps.idx - 1]) && return next(ps)
+
+    # Leading whitespace, or the previous character is a whitespace too: delete.
+    deleteat!(ps.input, ps.idx)
+    return true
+end
+
+"""
+    trailing_whitespace(ps::PreprocessBuffer)
+
+Remove the whitespaces at the end of the input stream.
+Does nothing for an empty buffer.
+"""
+function trailing_whitespace(ps)
+    # `findlast` returns `nothing` for an empty or all-whitespace buffer,
+    # in which case nothing is kept.
+    last_kept = something(findlast(!isspace, ps.input), 0)
+    resize!(ps.input, last_kept)
+    return nothing
+end
+
+"""
+    punctuation(ps::PreprocessBuffer)
+
+Remove punctuations, as matched by `ispunct`.
+Return `true` if the current character was deleted, `false` otherwise.
+"""
+function punctuation(ps)
+    ispunct(ps[]) || return false
+
+    deleteat!(ps.input, ps.idx)
+    return true
+end
+
+"""
+    numbers(ps::PreprocessBuffer)
+
+Remove digits, as matched by `isdigit`.
+Return `true` if the current character was deleted, `false` otherwise.
+"""
+function numbers(ps)
+    isdigit(ps[]) || return false
+
+    deleteat!(ps.input, ps.idx)
+    return true
+end
+
+"""
+    next_token(ps::PreprocessBuffer, ws)
+
+Helper function for `words_remove`.
+Matches the run of letters starting at `ps.idx` against the word set `ws`.
+Returns whether it matched and the index just past the end of the token.
+A run of letters that is directly followed by a digit never matches.
+"""
+function next_token(ps::PreprocessBuffer, ws)
+    i = ps.idx
+    while i <= length(ps.input) && isletter(ps[i])
+        i += 1
+    end
+    # `<=` so that a digit in the very last position is checked as well.
+    i <= length(ps.input) && isdigit(ps[i]) && return false, i
+
+    String(ps.input[ps.idx:i-1]) ∈ ws && return true, i
+    return false, i
+end
+
+"""
+    word_character(ch)
+
+Matches true for characters corresponding to Regex("[a-zA-Z0-9_]")
+"""
+word_character(ch) = isascii(ch) && (isuppercase(ch) || islowercase(ch) ||
+                        isdigit(ch) || ch == '_')
+
+"""
+    words_remove(ps::PreprocessBuffer, ws)
+
+Remove the word starting at the current position if it is in the word set `ws`.
+A word that is not in `ws` is skipped as a whole.
+Return `true` if a word was removed or skipped, `false` if the current position
+is not the start of a word.
+"""
+function words_remove(ps, ws)
+    ps.idx != 1 && word_character(ps[ps.idx - 1]) && return false
+    isletter(ps[ps.idx]) || return false
+
+    matched, i = next_token(ps, ws)
+
+    if matched
+        deleteat!(ps.input, ps.idx:i - 1)
+    else
+        ps.idx = i
+    end
+
+    return true
+end
+
+"""
+    next(ps::PreprocessBuffer)
+
+Advance to the next character. Always returns `true`, so it can end a chain of lexers.
+"""
+function next(ps::PreprocessBuffer)
+    ps.idx += 1
+    return true
+end
+
+"""
+    fastpreprocess(::StringDocument, flags)
+    fastpreprocess(::Corpus, flags)
+    fastpreprocess(::String, lang::T, flags) where T <: Language
+    fastpreprocess(::String, ::SortedSet)
+
+## Preprocessing functions currently available
+
+* corrupt_utf8
+* whitespace
+* punctuation
+* numbers
+
+### Flags for functions requiring `words_remove`
+
+* strip_indefinite_articles
+* strip_definite_articles
+* strip_articles
+* strip_stopwords
+* strip_prepositions
+* strip_pronouns
+
+## Usage
+
+    doc = StringDocument("this is a sample text")
+    fastpreprocess(doc, strip_articles)
+
+## Note:
+
+This does not work for Corpora consisting of `FileDocument`,
+`TokenDocument` or `NGramDocument`
+
+"""
+fastpreprocess(txt::String, lang = Languages.English(), flags = 0) = fastpreprocess(txt, build_set(flags, lang))
+
+"""
+    build_set(flags, lang = Languages.English())
+
+Collect the words of `lang` selected by the `strip_*` bits of `flags` into a `SortedSet`.
+"""
+function build_set(flags, lang = Languages.English())
+    ws = SortedSet{String}()
+
+    ((flags & strip_indefinite_articles) > 0) && union!(ws, indefinite_articles(lang))
+    ((flags & strip_definite_articles) > 0) && union!(ws, definite_articles(lang))
+
+    ((flags & strip_prepositions) > 0) && union!(ws, prepositions(lang))
+    ((flags & strip_pronouns) > 0) && union!(ws, pronouns(lang))
+    ((flags & strip_stopwords) > 0) && union!(ws, stopwords(lang))
+    return ws
+end
+
+# TODO: Check case insensitive in words
+function fastpreprocess(txt::String, ws::SortedSet)
+    # Return the (empty) string itself rather than `nothing`, so that callers
+    # can always assign the result to a document's text.
+    isempty(txt) && return txt
+    ps = PreprocessBuffer(txt)
+
+    while !isdone(ps)
+        whitespace(ps) ||
+        corrupt_utf8(ps) ||
+        punctuation(ps) ||
+        numbers(ps) ||
+        words_remove(ps, ws) || next(ps)
+    end
+
+    trailing_whitespace(ps)
+    return String(ps.input)
+end
+
+function fastpreprocess(doc::StringDocument, flags = 0)
+    doc.text = fastpreprocess(doc.text, build_set(flags, language(doc)))
+end
+
+# Only for String Document
+function fastpreprocess(crps::Corpus, flags = 0)
+    # An empty corpus has no first document to take the language from.
+    length(crps) == 0 && return crps
+    ws = build_set(flags, language(crps[1]))
+
+    for doc in crps
+        doc.text = fastpreprocess(doc.text, ws)
+    end
+    return crps
+end
+
+# HTML placed before words
diff --git a/test/fastpreprocess.jl b/test/fastpreprocess.jl
new file mode 100644
index 00000000..c174a661
--- /dev/null
+++ b/test/fastpreprocess.jl
@@ -0,0 +1,45 @@
+@testset "Preprocessing" begin
+    @testset "Words Removal" begin
+        doc = StringDocument("this is a the sample text")
+        fastpreprocess(doc, strip_articles)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is the sample text")
+        fastpreprocess(doc, strip_definite_articles)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is a sample text")
+        fastpreprocess(doc, strip_indefinite_articles)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is on sample text")
+        fastpreprocess(doc, strip_prepositions)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is my sample text")
+        fastpreprocess(doc, strip_pronouns)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is sample text")
+        fastpreprocess(doc, strip_stopwords)
+        @test isequal(strip(doc.text), "sample text")
+
+        # A word followed by a digit is not a removable token, even at the end of the input
+        doc = StringDocument("a1 a1")
+        fastpreprocess(doc, strip_indefinite_articles)
+        @test isequal(doc.text, "a a")
+    end
+
+    # test Remove Corrupt UTF8
+    sd = StringDocument("abc")
+    fastpreprocess(sd)
+    @test sd.text == "abc"
+
+    sd = StringDocument(String([0x43, 0xf0]))
+    fastpreprocess(sd)
+    @test sd.text == "C"
+
+    # Inputs that are empty, or become empty, must yield an empty string
+    @test fastpreprocess("") == ""
+    @test fastpreprocess("123 ...") == ""
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index bffc62d2..c251bdbd 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -15,6 +15,7 @@ include("document.jl")
 include("metadata.jl")
 include("corpus.jl")
 include("preprocessing.jl")
+include("fastpreprocess.jl")
 include("dtm.jl")
 include("stemmer.jl")
 include("tf_idf.jl")