From c478c5ead144c1dbbec3042f1b6f29796f8a883a Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Wed, 12 Jun 2019 10:14:05 +0530
Subject: [PATCH 01/14] Add PreprocessBuffer

---
 src/TextAnalysis.jl   |  4 ++++
 src/fastpreprocess.jl | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 src/fastpreprocess.jl

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index c2f88311..ff9a7a71 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -60,6 +60,9 @@ module TextAnalysis
     include("corpus.jl")
     include("metadata.jl")
     include("preprocessing.jl")
+
+    include("fastpreprocess.jl")
+
     # Load libstemmer from our deps.jl
     const depsjl_path = joinpath(dirname(@__FILE__), "..", "deps", "deps.jl")
     if !isfile(depsjl_path)
@@ -79,4 +82,5 @@ module TextAnalysis
     include("deprecations.jl")
     include("utils.jl")
     include("rouge.jl")
+
 end
diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
new file mode 100644
index 00000000..020f113a
--- /dev/null
+++ b/src/fastpreprocess.jl
@@ -0,0 +1,34 @@
+# TODO
+# * strip_sparse_terms
+# * strip_frequent_terms
+# * strip_html_tags
+# * strip_non_letters
+"""
+Preprocessing functions
+
+* strip_case
+
+* corrupt_utf8
+* whitespace
+* punctuation
+* numbers
+* indefinite_articles
+* definite_articles
+* articles
+* stopwords
+* prepositions
+* pronouns
+"""
+mutable struct PreprocessBuffer
+    input::Vector{Char}
+    buffer::Vector{Char}
+    idx::Int
+end
+
+PreprocessBuffer(input) = PreprocessBuffer(input, [], 1)
+
+PreprocessBuffer(input::AbstractString) = PreprocessBuffer(collect(input))
+
+Base.getindex(ps::PreprocessBuffer, i = ps.idx) = ps.input[i]
+
+isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input)

From 08c41c58a23b2058ad2ba5cd158a4b94c206d671 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Wed, 12 Jun 2019 10:15:17 +0530
Subject: [PATCH 02/14] Lexers for preprocessbuffer

---
 src/fastpreprocess.jl | 93 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 020f113a..272263bf 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -32,3 +32,96 @@ PreprocessBuffer(input::AbstractString) = PreprocessBuffer(collect(input))
 Base.getindex(ps::PreprocessBuffer, i = ps.idx) = ps.input[i]
 
 isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input)
+
+"""
+    corrupt_utf8(ps::PreprocessBuffer)
+
+Removes the corrupt UTF8 chars.
+"""
+function corrupt_utf8(ps)
+    return false
+end
+
+"""
+    whitespace(ps::PreprocessBuffer)
+
+Squash multiple whitespaces to a single one.
+And remove all leading and trailing whitespaces.
+"""
+function whitespace(ps)
+    return false
+end
+
+"""
+    punctuation(ps::PreprocessBuffer)
+
+Squash multiple whitespaces to a single one.
+And remove all leading and trailing whitespaces.
+
+"""
+function punctuation(ps)
+    return false
+end
+
+"""
+    numbers(::PreprocessBuffer)
+
+Removes all numbers.
+
+"""
+function numbers(ps)
+    return false
+end
+
+"""
+    lookahead(::PreprocessBuffer, s; boundary = false)
+
+Peek at the input to see if `s` is coming up next. `boundary` specifies whether
+a word boundary should follow `s`.
+
+```
+julia> lookahead(PreprocessBuffer("foo bar"), "foo")
+true
+julia> lookahead(PreprocessBuffer("foo bar"), "bar")
+false
+julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true)
+true
+julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true)
+false
+```
+"""
+function lookahead(ps::PreprocessBuffer, s; boundary = false)
+    ps.idx + length(s) - 1 > length(ps.input) && return false
+
+    for j = 1:length(s)
+        ps.input[ps.idx - 1 + j] == s[j] || return false
+    end
+    if boundary
+        next = ps.idx + length(s)
+        next > length(ps.input) && return true
+        (isletter(ps[next]) || ps[next] == '-') && return false
+    end
+    return true
+end
+
+"""
+Matches true for characters corresponding to Regex("[a-zA-Z0-9_]")
+"""
+word_character(ch) = isascii(ch) && (isuppercase(ch) || islowercase(ch) ||
+                                            isdigit(ch) || ch == '_')
+
+
+"""
+    words_remove(::PreprocessBuffer, ws)
+
+Removes ws from the PreprocessBuffer.
+"""
+function words_remove(ps, ws)
+    ps.idx != 1 && word_character(ps[ps.idx - 1]) && return false
+    for s in ws
+        lookahead(ps, s, boundary=true) || continue
+        ps.idx += length(s)
+        return true
+    end
+    return false
+end

From 84388ec77d6853e51543d01428ddb2d17549dbb6 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Fri, 14 Jun 2019 10:16:05 +0530
Subject: [PATCH 03/14] Preprocess function

---
 src/fastpreprocess.jl | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 272263bf..75a1d170 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -125,3 +125,38 @@ function words_remove(ps, ws)
     end
     return false
 end
+
+function try_fast(text, lang)
+    length(text) < 1 && return
+
+    indef_a = indefinite_articles(lang)
+    def_a = definite_articles(lang)
+    stop = stopwords(lang)
+    prepo = prepositions(lang)
+    pron = pronouns(lang)
+
+    ws = vcat(indef_a, def_a, stop, prepo, pron)
+
+    ps = PreprocessBuffer(text)
+
+    # TODO: Check case insensitive in words
+
+    while !isdone(ps)
+        (corrupt_utf8(ps) ||
+        whitespace(ps) ||
+        punctuation(ps) ||
+        numbers(ps) ||
+        words_remove(ps, ws)) && continue
+
+        push!(ps.buffer, ps[])
+        ps.idx += 1
+    end
+
+    return String(ps.buffer)
+end
+
+function try_fast(doc::StringDocument)
+    doc.text =  try_fast(doc.text, Languages.English())
+end
+
+# HTML placed before words

From 4f90d7b664ef1eb060259cd7e15ecca90cc0230a Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Sun, 16 Jun 2019 13:22:59 +0530
Subject: [PATCH 04/14] Speed up PreprocessBuffer 10x

---
 src/TextAnalysis.jl   |  1 +
 src/fastpreprocess.jl | 67 ++++++++++++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index ff9a7a71..2c3a22b3 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -6,6 +6,7 @@ module TextAnalysis
     using Languages
     using DataFrames
     using WordTokenizers
+    using DataStructures
 
     import DataFrames.DataFrame
     import Base.depwarn
diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 75a1d170..0794a968 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -42,22 +42,11 @@ function corrupt_utf8(ps)
     return false
 end
 
-"""
-    whitespace(ps::PreprocessBuffer)
-
-Squash multiple whitespaces to a single one.
-And remove all leading and trailing whitespaces.
-"""
-function whitespace(ps)
-    return false
-end
-
 """
     punctuation(ps::PreprocessBuffer)
 
 Squash multiple whitespaces to a single one.
 And remove all leading and trailing whitespaces.
-
 """
 function punctuation(ps)
     return false
@@ -67,7 +56,6 @@ end
     numbers(::PreprocessBuffer)
 
 Removes all numbers.
-
 """
 function numbers(ps)
     return false
@@ -104,26 +92,49 @@ function lookahead(ps::PreprocessBuffer, s; boundary = false)
     return true
 end
 
+"""
+Helper function for words_remove.
+"""
+function next_token(ps::PreprocessBuffer)
+    i = ps.idx
+    while i < length(ps.input) && isletter(ps[i])
+        i += 1
+    end
+
+    return String(ps.input[ps.idx:i-1])
+end
+
 """
 Matches true for characters corresponding to Regex("[a-zA-Z0-9_]")
 """
 word_character(ch) = isascii(ch) && (isuppercase(ch) || islowercase(ch) ||
                                             isdigit(ch) || ch == '_')
 
-
 """
     words_remove(::PreprocessBuffer, ws)
 
-Removes ws from the PreprocessBuffer.
+Removes words from the PreprocessBuffer.
 """
 function words_remove(ps, ws)
     ps.idx != 1 && word_character(ps[ps.idx - 1]) && return false
-    for s in ws
-        lookahead(ps, s, boundary=true) || continue
-        ps.idx += length(s)
-        return true
+    isletter(ps[ps.idx]) || return false
+
+    token = next_token(ps)
+    # println(token)
+
+    if token ∉ ws
+        append!(ps.buffer, ps.input[ps.idx:ps.idx + length(token) ])
+        ps.idx = ps.idx + length(token) + 1
+    else
+        ps.idx += length(token)
     end
-    return false
+
+    return true
+end
+
+function next(ps::PreprocessBuffer)
+    push!(ps.buffer, ps[])
+    ps.idx += 1
 end
 
 function try_fast(text, lang)
@@ -135,21 +146,16 @@ function try_fast(text, lang)
     prepo = prepositions(lang)
     pron = pronouns(lang)
 
-    ws = vcat(indef_a, def_a, stop, prepo, pron)
-
+    ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron))
     ps = PreprocessBuffer(text)
 
     # TODO: Check case insensitive in words
 
     while !isdone(ps)
-        (corrupt_utf8(ps) ||
-        whitespace(ps) ||
+        corrupt_utf8(ps) ||
         punctuation(ps) ||
         numbers(ps) ||
-        words_remove(ps, ws)) && continue
-
-        push!(ps.buffer, ps[])
-        ps.idx += 1
+        words_remove(ps, ws) || next(ps)
     end
 
     return String(ps.buffer)
@@ -159,4 +165,11 @@ function try_fast(doc::StringDocument)
     doc.text =  try_fast(doc.text, Languages.English())
 end
 
+# Only for String Document
+function fastpreprocess(crps::Corpus, flags)
+
+    # strip
+
+end
+
 # HTML placed before words

From 1d7cb5dcff9461d3b997c1c2c336970c7647c446 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Sun, 16 Jun 2019 14:08:29 +0530
Subject: [PATCH 05/14] Change to fastpreprocess

---
 src/fastpreprocess.jl | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 0794a968..38a0735b 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -137,7 +137,8 @@ function next(ps::PreprocessBuffer)
     ps.idx += 1
 end
 
-function try_fast(text, lang)
+# ws of type Sorted Set
+function fastpreprocess(text::String, lang)
     length(text) < 1 && return
 
     indef_a = indefinite_articles(lang)
@@ -161,15 +162,11 @@ function try_fast(text, lang)
     return String(ps.buffer)
 end
 
-function try_fast(doc::StringDocument)
+function fastpreprocess(doc::StringDocument)
     doc.text =  try_fast(doc.text, Languages.English())
 end
 
 # Only for String Document
 function fastpreprocess(crps::Corpus, flags)
-
-    # strip
-
 end
-
 # HTML placed before words

From 6284fb1c73fc783127a9fd26591c9bd076b80faf Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 02:42:50 +0530
Subject: [PATCH 06/14] Remove Buffer, speed up 8x

---
 src/fastpreprocess.jl | 86 ++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 42 deletions(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 38a0735b..32f71e43 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -21,11 +21,11 @@ Preprocessing functions
 """
 mutable struct PreprocessBuffer
     input::Vector{Char}
-    buffer::Vector{Char}
+    # buffer::Vector{Char}
     idx::Int
 end
 
-PreprocessBuffer(input) = PreprocessBuffer(input, [], 1)
+PreprocessBuffer(input) = PreprocessBuffer(input, 1)
 
 PreprocessBuffer(input::AbstractString) = PreprocessBuffer(collect(input))
 
@@ -61,47 +61,48 @@ function numbers(ps)
     return false
 end
 
-"""
-    lookahead(::PreprocessBuffer, s; boundary = false)
-
-Peek at the input to see if `s` is coming up next. `boundary` specifies whether
-a word boundary should follow `s`.
-
-```
-julia> lookahead(PreprocessBuffer("foo bar"), "foo")
-true
-julia> lookahead(PreprocessBuffer("foo bar"), "bar")
-false
-julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true)
-true
-julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true)
-false
-```
-"""
-function lookahead(ps::PreprocessBuffer, s; boundary = false)
-    ps.idx + length(s) - 1 > length(ps.input) && return false
-
-    for j = 1:length(s)
-        ps.input[ps.idx - 1 + j] == s[j] || return false
-    end
-    if boundary
-        next = ps.idx + length(s)
-        next > length(ps.input) && return true
-        (isletter(ps[next]) || ps[next] == '-') && return false
-    end
-    return true
-end
+# """
+#     lookahead(::PreprocessBuffer, s; boundary = false)
+#
+# Peek at the input to see if `s` is coming up next. `boundary` specifies whether
+# a word boundary should follow `s`.
+#
+# ```
+# julia> lookahead(PreprocessBuffer("foo bar"), "foo")
+# true
+# julia> lookahead(PreprocessBuffer("foo bar"), "bar")
+# false
+# julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true)
+# true
+# julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true)
+# false
+# ```
+# """
+# function lookahead(ps::PreprocessBuffer, s; boundary = false)
+#     ps.idx + length(s) - 1 > length(ps.input) && return false
+#
+#     for j = 1:length(s)
+#         ps.input[ps.idx - 1 + j] == s[j] || return false
+#     end
+#     if boundary
+#         next = ps.idx + length(s)
+#         next > length(ps.input) && return true
+#         (isletter(ps[next]) || ps[next] == '-') && return false
+#     end
+#     return true
+# end
 
 """
 Helper function for words_remove.
 """
-function next_token(ps::PreprocessBuffer)
+function next_token(ps::PreprocessBuffer, ws)
     i = ps.idx
     while i < length(ps.input) && isletter(ps[i])
         i += 1
     end
 
-    return String(ps.input[ps.idx:i-1])
+    String(ps.input[ps.idx:i-1]) ∈ ws && return true, i
+    return false, i
 end
 
 """
@@ -119,21 +120,21 @@ function words_remove(ps, ws)
     ps.idx != 1 && word_character(ps[ps.idx - 1]) && return false
     isletter(ps[ps.idx]) || return false
 
-    token = next_token(ps)
+    match, i = next_token(ps, ws)
     # println(token)
 
-    if token ∉ ws
-        append!(ps.buffer, ps.input[ps.idx:ps.idx + length(token) ])
-        ps.idx = ps.idx + length(token) + 1
+    if match == false
+        ps.idx = ps.idx + i
     else
-        ps.idx += length(token)
+        deleteat!(ps.input, ps.idx:i - 1)
+        ps.idx += 1
     end
 
     return true
 end
 
 function next(ps::PreprocessBuffer)
-    push!(ps.buffer, ps[])
+    # push!(ps.buffer, ps[])
     ps.idx += 1
 end
 
@@ -159,11 +160,12 @@ function fastpreprocess(text::String, lang)
         words_remove(ps, ws) || next(ps)
     end
 
-    return String(ps.buffer)
+    return String(ps.input)
 end
 
 function fastpreprocess(doc::StringDocument)
-    doc.text =  try_fast(doc.text, Languages.English())
+    doc.text =  fastpreprocess(doc.text, Languages.English())
+    println()
 end
 
 # Only for String Document

From 5157c0d2cc8fecf6b8ae52548f32f3153dae51d5 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 14:02:19 +0530
Subject: [PATCH 07/14] Fix minor bugz

---
 src/fastpreprocess.jl | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 32f71e43..4c4d59ba 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -33,6 +33,7 @@ Base.getindex(ps::PreprocessBuffer, i = ps.idx) = ps.input[i]
 
 isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input)
 
+# TODO: Remove whitespace at the beginning and end, and squash multiple whitespaces into one.
 """
     corrupt_utf8(ps::PreprocessBuffer)
 
@@ -97,7 +98,7 @@ Helper function for words_remove.
 """
 function next_token(ps::PreprocessBuffer, ws)
     i = ps.idx
-    while i < length(ps.input) && isletter(ps[i])
+    while i <= length(ps.input) && isletter(ps[i])
         i += 1
     end
 
@@ -121,11 +122,14 @@ function words_remove(ps, ws)
     isletter(ps[ps.idx]) || return false
 
     match, i = next_token(ps, ws)
-    # println(token)
 
     if match == false
         ps.idx = ps.idx + i
     else
+        if ps.idx > 1 && isspace(ps[ps.idx - 1])
+            ps.idx -= 1
+        end
+
         deleteat!(ps.input, ps.idx:i - 1)
         ps.idx += 1
     end
@@ -134,13 +138,12 @@ function words_remove(ps, ws)
 end
 
 function next(ps::PreprocessBuffer)
-    # push!(ps.buffer, ps[])
     ps.idx += 1
 end
 
 # ws of type Sorted Set
-function fastpreprocess(text::String, lang)
-    length(text) < 1 && return
+function fastpreprocess(txt::String, lang)
+    length(txt) < 1 && return
 
     indef_a = indefinite_articles(lang)
     def_a = definite_articles(lang)
@@ -149,7 +152,7 @@ function fastpreprocess(text::String, lang)
     pron = pronouns(lang)
 
     ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron))
-    ps = PreprocessBuffer(text)
+    ps = PreprocessBuffer(txt)
 
     # TODO: Check case insensitive in words
 
@@ -165,7 +168,7 @@ end
 
 function fastpreprocess(doc::StringDocument)
     doc.text =  fastpreprocess(doc.text, Languages.English())
-    println()
+    nothing
 end
 
 # Only for String Document

From ee43f17e3a9415f12bca41d6351e16bb16923447 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 14:10:48 +0530
Subject: [PATCH 08/14] Corrupt utf8

---
 src/fastpreprocess.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 4c4d59ba..ea3207e4 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -40,7 +40,10 @@ isdone(ps::PreprocessBuffer) = ps.idx > length(ps.input)
 Removes the corrupt UTF8 chars.
 """
 function corrupt_utf8(ps)
-    return false
+    isvalid(ps[ps.idx]) && return false
+
+    deleteat!(ps, ps.idx)
+    return true
 end
 
 """

From ce66660cc6fe92bee19b2c4442d24cf6cc13eab9 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 19:51:08 +0530
Subject: [PATCH 09/14] Add functions for whitespaces, numbers, punct

---
 src/fastpreprocess.jl | 52 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index ea3207e4..64a3acfd 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -47,13 +47,49 @@ function corrupt_utf8(ps)
 end
 
 """
-    punctuation(ps::PreprocessBuffer)
+    whitespace(ps::PreprocessBuffer)
 
 Squash multiple whitespaces to a single one.
 And remove all leading and trailing whitespaces.
 """
+function whitespace(ps)
+    isspace(ps) || return false
+
+    ps.idx != 1 && !isspace(ps[ps.idx - 1]) && return false
+
+    deleteat!(ps, ps.idx)
+    return true
+
+    # If prev is whitespace then delete.
+end
+
+function trailing_whitespace(ps)
+    isspace(ps[i]) || return
+
+    i = length(ps.input)
+
+    while (i > 0) && isspace(ps[i])
+        i -= 1
+    end
+
+    deleteat!(ps, i + 1: length(ps.input))
+end
+
+"""
+    punctuation(ps::PreprocessBuffer)
+
+Remove punctuations.
+"""
 function punctuation(ps)
-    return false
+    ispunct(ps[]) || return false
+
+    if ps.idx > 1 && isspace(ps[ps.idx - 1])
+        deleteat!(ps, ps.idx - 1:ps.idx)
+    else
+        deleteat!(ps, ps.idx)
+    end
+
+    return true
 end
 
 """
@@ -62,7 +98,15 @@ end
 Removes all numbers.
 """
 function numbers(ps)
-    return false
+    isdigit(ps[]) || return false
+
+    if ps.idx > 1 && isspace(ps[ps.idx - 1])
+        deleteat!(ps, ps.idx - 1:ps.idx)
+    else
+        deleteat!(ps, ps.idx)
+    end
+
+    return true
 end
 
 # """
@@ -166,6 +210,8 @@ function fastpreprocess(txt::String, lang)
         words_remove(ps, ws) || next(ps)
     end
 
+    trailing_whitespace(ps)
+
     return String(ps.input)
 end
 

From 4ec1f714b55c4d136c5fd5ca764989964dbc0f48 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 20:32:48 +0530
Subject: [PATCH 10/14] Minor bug fixes

---
 src/fastpreprocess.jl | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 64a3acfd..b95f37b9 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -53,26 +53,25 @@ Squash multiple whitespaces to a single one.
 And remove all leading and trailing whitespaces.
 """
 function whitespace(ps)
-    isspace(ps) || return false
+    isspace(ps[ps.idx]) || return false
 
-    ps.idx != 1 && !isspace(ps[ps.idx - 1]) && return false
+    ps.idx != 1 && !isspace(ps[ps.idx - 1]) && return next(ps)
 
-    deleteat!(ps, ps.idx)
+    deleteat!(ps.input, ps.idx)
     return true
 
     # If prev is whitespace then delete.
 end
 
 function trailing_whitespace(ps)
-    isspace(ps[i]) || return
-
-    i = length(ps.input)
+    isspace(ps[length(ps.input)]) || return
+    i = length(ps.input) - 1
 
     while (i > 0) && isspace(ps[i])
         i -= 1
     end
 
-    deleteat!(ps, i + 1: length(ps.input))
+    deleteat!(ps.input, i + 1: length(ps.input))
 end
 
 """
@@ -83,12 +82,7 @@ Remove punctuations.
 function punctuation(ps)
     ispunct(ps[]) || return false
 
-    if ps.idx > 1 && isspace(ps[ps.idx - 1])
-        deleteat!(ps, ps.idx - 1:ps.idx)
-    else
-        deleteat!(ps, ps.idx)
-    end
-
+    deleteat!(ps.input, ps.idx)
     return true
 end
 
@@ -100,12 +94,7 @@ Removes all numbers.
 function numbers(ps)
     isdigit(ps[]) || return false
 
-    if ps.idx > 1 && isspace(ps[ps.idx - 1])
-        deleteat!(ps, ps.idx - 1:ps.idx)
-    else
-        deleteat!(ps, ps.idx)
-    end
-
+    deleteat!(ps.input, ps.idx)
     return true
 end
 
@@ -148,6 +137,7 @@ function next_token(ps::PreprocessBuffer, ws)
     while i <= length(ps.input) && isletter(ps[i])
         i += 1
     end
+    i < length(ps.input) && isdigit(ps[i]) && return false, i
 
     String(ps.input[ps.idx:i-1]) ∈ ws && return true, i
     return false, i
@@ -171,14 +161,9 @@ function words_remove(ps, ws)
     match, i = next_token(ps, ws)
 
     if match == false
-        ps.idx = ps.idx + i
+        ps.idx = i
     else
-        if ps.idx > 1 && isspace(ps[ps.idx - 1])
-            ps.idx -= 1
-        end
-
         deleteat!(ps.input, ps.idx:i - 1)
-        ps.idx += 1
     end
 
     return true
@@ -186,6 +171,7 @@ end
 
 function next(ps::PreprocessBuffer)
     ps.idx += 1
+    return true
 end
 
 # ws of type Sorted Set
@@ -204,13 +190,14 @@ function fastpreprocess(txt::String, lang)
     # TODO: Check case insensitive in words
 
     while !isdone(ps)
+        whitespace(ps) ||
         corrupt_utf8(ps) ||
         punctuation(ps) ||
         numbers(ps) ||
         words_remove(ps, ws) || next(ps)
     end
 
-    trailing_whitespace(ps)
+    # trailing_whitespace(ps)
 
     return String(ps.input)
 end

From 851e746c1d522b63f6b45cdaf55ee29d8771722e Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 21:06:11 +0530
Subject: [PATCH 11/14] Add docstrings for fastpreprocessing.jl

---
 src/fastpreprocess.jl | 95 +++++++++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 40 deletions(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index b95f37b9..23309e0f 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -1,13 +1,12 @@
-# TODO
-# * strip_sparse_terms
-# * strip_frequent_terms
+# TODO Figure out the following:
+# * strip_sparse_terms - to utilize `words_remove` and `sparse_terms` (of preprocessing.jl).
+# * strip_frequent_terms - to utilize `words_remove` and `frequent_terms` (of preprocessing.jl).
 # * strip_html_tags
 # * strip_non_letters
+# * strip_case
 """
 Preprocessing functions
 
-* strip_case
-
 * corrupt_utf8
 * whitespace
 * punctuation
@@ -18,10 +17,28 @@ Preprocessing functions
 * stopwords
 * prepositions
 * pronouns
+
+
+Turns a string into a readable and writable stream,
+used for preprocessing and flushing out the processed text.
+
+Utility functions (lexers) such as `spaces` and `number` read characters from the stream
+and match against it.
+
+Functions (lexers) return `true` or `false` to indicate whether they matched anything
+in the input stream. They can therefore be combined easily, e.g.
+
+    spacesornumber(ts) = whitespace(ts) || numbers(ts)
+
+either deletes two consecutively read whitespaces or removes a number character, if matched.
+
+For flags such as `strip_pronouns`, `strip_prepositions`, `strip_stopwords`, etc.,
+the words to remove are stored in a `SortedSet` for faster preprocessing, and
+the function `words_remove` matches the words / tokens in the stream
+against that set.
 """
 mutable struct PreprocessBuffer
     input::Vector{Char}
-    # buffer::Vector{Char}
     idx::Int
 end
 
@@ -63,6 +80,11 @@ function whitespace(ps)
     # If prev is whitespace then delete.
 end
 
+"""
+    trailing_whitespace(ps::PreprocessBuffer)
+
+Remove the whitespaces at the end of the input stream.
+"""
 function trailing_whitespace(ps)
     isspace(ps[length(ps.input)]) || return
     i = length(ps.input) - 1
@@ -77,7 +99,7 @@ end
 """
     punctuation(ps::PreprocessBuffer)
 
-Remove punctuations.
+Remove punctuations, as matched by `ispunct`.
 """
 function punctuation(ps)
     ispunct(ps[]) || return false
@@ -98,39 +120,10 @@ function numbers(ps)
     return true
 end
 
-# """
-#     lookahead(::PreprocessBuffer, s; boundary = false)
-#
-# Peek at the input to see if `s` is coming up next. `boundary` specifies whether
-# a word boundary should follow `s`.
-#
-# ```
-# julia> lookahead(PreprocessBuffer("foo bar"), "foo")
-# true
-# julia> lookahead(PreprocessBuffer("foo bar"), "bar")
-# false
-# julia> lookahead(PreprocessBuffer("foo bar"), "foo", boundary = true)
-# true
-# julia> lookahead(PreprocessBuffer("foobar"), "foo", boundary = true)
-# false
-# ```
-# """
-# function lookahead(ps::PreprocessBuffer, s; boundary = false)
-#     ps.idx + length(s) - 1 > length(ps.input) && return false
-#
-#     for j = 1:length(s)
-#         ps.input[ps.idx - 1 + j] == s[j] || return false
-#     end
-#     if boundary
-#         next = ps.idx + length(s)
-#         next > length(ps.input) && return true
-#         (isletter(ps[next]) || ps[next] == '-') && return false
-#     end
-#     return true
-# end
-
 """
 Helper function for words_remove.
+Matches the next token in the stream against the `ws::SortedSet`.
+Returns whether it matched and the idx of the token end
 """
 function next_token(ps::PreprocessBuffer, ws)
     i = ps.idx
@@ -174,7 +167,22 @@ function next(ps::PreprocessBuffer)
     return true
 end
 
-# ws of type Sorted Set
+"""
+Preprocessing functions
+
+* strip_case
+
+* corrupt_utf8
+* whitespace
+* punctuation
+* numbers
+* indefinite_articles
+* definite_articles
+* articles
+* stopwords
+* prepositions
+* pronouns
+"""
 function fastpreprocess(txt::String, lang)
     length(txt) < 1 && return
 
@@ -185,6 +193,13 @@ function fastpreprocess(txt::String, lang)
     pron = pronouns(lang)
 
     ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron))
+
+    return fastpreprocess(txt, ws)
+end
+
+function fastpreprocess(txt::String, ws::SortedSet)
+    length(txt) < 1 && return
+
     ps = PreprocessBuffer(txt)
 
     # TODO: Check case insensitive in words
@@ -197,7 +212,7 @@ function fastpreprocess(txt::String, lang)
         words_remove(ps, ws) || next(ps)
     end
 
-    # trailing_whitespace(ps)
+    trailing_whitespace(ps)
 
     return String(ps.input)
 end

From da4a64ca9acdac3a2c0969a696113b9573bc98c3 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 22:08:51 +0530
Subject: [PATCH 12/14] Add support for preprocessing over Corpus and Docs

---
 src/fastpreprocess.jl | 65 ++++++++++++++++++++++++++++---------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 23309e0f..1540deec 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -19,8 +19,8 @@ Preprocessing functions
 * pronouns
 
 
-Turns a string into a readable and writable stream,
-used for preprocessing and flushing out the processed text.
+Turns a string into a readable and writable stream of `Char`s,
+used for preprocessing and flushing out the processed text.
 
 Utility functions (lexers) such as `spaces` and `number` read characters from the stream
 and match against it.
@@ -168,33 +168,48 @@ function next(ps::PreprocessBuffer)
 end
 
 """
-Preprocessing functions
+    fastpreprocess(::StringDocument, flags)
+    fastpreprocess(::Corpus, flags)
+    fastpreprocess(::String, lang::T, flags) where T <: Language
+    fastpreprocess(::String, ::SortedSet)
 
-* strip_case
+## Preprocessing functions currently available
 
 * corrupt_utf8
 * whitespace
 * punctuation
 * numbers
-* indefinite_articles
-* definite_articles
-* articles
-* stopwords
-* prepositions
-* pronouns
+
+### Flags for functions requiring `words_remove`
+
+* strip_indefinite_articles
+* strip_definite_articles
+* strip_articles
+* strip_stopwords
+* strip_prepositions
+* strip_pronouns
+
+## Usage
+
+
+## Note:
+
+This does not work for Corpora consisting of `FileDocument`,
+`TokenDocument` or `NGramDocument`
+
 """
-function fastpreprocess(txt::String, lang)
-    length(txt) < 1 && return
+fastpreprocess(txt::String, lang, flags) = fastpreprocess(txt, build_set(flags, lang))
 
-    indef_a = indefinite_articles(lang)
-    def_a = definite_articles(lang)
-    stop = stopwords(lang)
-    prepo = prepositions(lang)
-    pron = pronouns(lang)
+function build_set(flags, lang)
+    ws = SortedSet()
 
-    ws = SortedSet(vcat(indef_a, def_a, stop, prepo, pron))
+    ((flags & strip_indefinite_articles) > 0) && union!(ws, indefinite_articles(lang))
+    ((flags & strip_definite_articles) > 0) && union!(ws, definite_articles(lang))
 
-    return fastpreprocess(txt, ws)
+    ((flags & strip_prepositions) > 0) && union!(ws, prepositions(lang))
+    ((flags & strip_pronouns) > 0) && union!(ws, pronouns(lang))
+    ((flags & strip_stopwords) > 0) && union!(ws, stopwords(lang))
+    ws
 end
 
 function fastpreprocess(txt::String, ws::SortedSet)
@@ -217,12 +232,18 @@ function fastpreprocess(txt::String, ws::SortedSet)
     return String(ps.input)
 end
 
-function fastpreprocess(doc::StringDocument)
-    doc.text =  fastpreprocess(doc.text, Languages.English())
-    nothing
+function fastpreprocess(doc::StringDocument, flags)
+    doc.text =  fastpreprocess(doc.text, build_set(flags, lang(doc)))
 end
 
 # Only for String Document
 function fastpreprocess(crps::Corpus, flags)
+    ws = build_set(flags, lang(crps[1]))
+
+    for doc in crps
+        doc.text = fastpreprocess(doc.text, ws)))
+    end
+    crps
 end
+
 # HTML placed before words

From 8bc5f933db5914f4e5cd76dce8ac9ffa66420d1b Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 22:39:42 +0530
Subject: [PATCH 13/14] Add tests for PreprocessBuffer.

---
 test/fastpreprocess.jl | 36 ++++++++++++++++++++++++++++++++++++
 test/runtests.jl       |  1 +
 2 files changed, 37 insertions(+)
 create mode 100644 test/fastpreprocess.jl

diff --git a/test/fastpreprocess.jl b/test/fastpreprocess.jl
new file mode 100644
index 00000000..c174a661
--- /dev/null
+++ b/test/fastpreprocess.jl
@@ -0,0 +1,36 @@
+@testset "Preprocessing" begin
+    @testset "Words Removal" begin
+        doc = StringDocument("this is a the sample text")
+        fastpreprocess(doc, strip_articles)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is the sample text")
+        fastpreprocess(doc, strip_definite_articles)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is a sample text")
+        fastpreprocess(doc, strip_indefinite_articles)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is on sample text")
+        fastpreprocess(doc, strip_prepositions)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is my sample text")
+        fastpreprocess(doc, strip_pronouns)
+        @test isequal(doc.text, "this is sample text")
+
+        doc = Document("this is sample text")
+        fastpreprocess(doc, strip_stopwords)
+        @test isequal(strip(doc.text), "sample text")
+    end
+
+    # test Remove Corrupt UTF8
+    sd = StringDocument("abc")
+    fastpreprocess(sd)
+    @test sd.text == "abc"
+
+    sd = StringDocument(String([0x43, 0xf0]))
+    fastpreprocess(sd)
+    @test sd.text == "C"
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index bffc62d2..c251bdbd 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -15,6 +15,7 @@ include("document.jl")
 include("metadata.jl")
 include("corpus.jl")
 include("preprocessing.jl")
+include("fastpreprocess.jl")
 include("dtm.jl")
 include("stemmer.jl")
 include("tf_idf.jl")

From c81397281542a3bbbff8589d8a7b88801712d441 Mon Sep 17 00:00:00 2001
From: Ayushk4 <ayushk4@gmail.com>
Date: Mon, 24 Jun 2019 22:40:18 +0530
Subject: [PATCH 14/14] Optional args in fastpreprocess

---
 src/TextAnalysis.jl   |  1 +
 src/fastpreprocess.jl | 21 +++++++++------------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 261d0ced..cdc761b3 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -46,6 +46,7 @@ module TextAnalysis
     export tf, tf_idf, lsa, lda, summarize
     export tf!, tf_idf!, lsa!, lda!
     export remove_patterns!, remove_patterns
+    export fastpreprocess, PreprocessBuffer
 
     export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
     export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
diff --git a/src/fastpreprocess.jl b/src/fastpreprocess.jl
index 1540deec..faf97697 100644
--- a/src/fastpreprocess.jl
+++ b/src/fastpreprocess.jl
@@ -59,7 +59,7 @@ Removes the corrupt UTF8 chars.
 function corrupt_utf8(ps)
     isvalid(ps[ps.idx]) && return false
 
-    deleteat!(ps, ps.idx)
+    deleteat!(ps.input, ps.idx)
     return true
 end
 
@@ -198,9 +198,9 @@ This does not work for Corpora consisting of `FileDocument`,
 `TokenDocument` or `NGramDocument`
 
 """
-fastpreprocess(txt::String, lang, flags) = fastpreprocess(txt, build_set(flags, lang))
+fastpreprocess(txt::String, lang = Languages.English(), flags = 0) = fastpreprocess(txt, build_set(flags, lang))
 
-function build_set(flags, lang)
+function build_set(flags, lang = Languages.English())
     ws = SortedSet()
 
     ((flags & strip_indefinite_articles) > 0) && union!(ws, indefinite_articles(lang))
@@ -212,13 +212,11 @@ function build_set(flags, lang)
     ws
 end
 
+# TODO: Check case insensitive in words
 function fastpreprocess(txt::String, ws::SortedSet)
     length(txt) < 1 && return
-
     ps = PreprocessBuffer(txt)
 
-    # TODO: Check case insensitive in words
-
     while !isdone(ps)
         whitespace(ps) ||
         corrupt_utf8(ps) ||
@@ -228,20 +226,19 @@ function fastpreprocess(txt::String, ws::SortedSet)
     end
 
     trailing_whitespace(ps)
-
     return String(ps.input)
 end
 
-function fastpreprocess(doc::StringDocument, flags)
-    doc.text =  fastpreprocess(doc.text, build_set(flags, lang(doc)))
+function fastpreprocess(doc::StringDocument, flags = 0)
+    doc.text =  fastpreprocess(doc.text, build_set(flags, language(doc)))
 end
 
 # Only for String Document
-function fastpreprocess(crps::Corpus, flags)
-    ws = build_set(flags, lang(crps[1]))
+function fastpreprocess(crps::Corpus, flags = 0)
+    ws = build_set(flags, language(crps[1]))
 
     for doc in crps
-        doc.text = fastpreprocess(doc.text, ws)))
+        doc.text = fastpreprocess(doc.text, ws)
     end
     crps
 end