{-# LANGUAGE ScopedTypeVariables #-}
{-|

Easy regular expression helpers, currently based on regex-tdfa. These should:

- be cross-platform, not requiring C libraries

- support unicode

- support extended regular expressions

- support replacement, with backreferences etc.

- support splitting

- have mnemonic names

- have simple monomorphic types

- work with simple strings

Regex strings are automatically compiled into regular expressions the
first time they are seen, and these are cached. If you use a huge
number of unique regular expressions this might lead to increased
memory usage. Several functions have memoised variants (*Memo), which
also trade space for time.

Current limitations:

- (?i) and similar are not supported

-}

module Hledger.Utils.Regex (
   -- * type aliases
   Regexp
  ,Replacement
   -- * standard regex operations
  ,regexMatches
  ,regexMatchesCI
  ,regexReplace
  ,regexReplaceCI
  ,regexReplaceMemo
  ,regexReplaceCIMemo
  ,regexReplaceBy
  ,regexReplaceByCI
  )
where

import Data.Array
import Data.Char
import Data.List (foldl')
import Data.Maybe (fromMaybe)
import Data.MemoUgly (memo)
import Text.Regex.TDFA (
  Regex, CompOption(..), ExecOption(..), defaultCompOpt, defaultExecOpt,
  makeRegexOptsM, AllMatches(getAllMatches), match, (=~), MatchText
  )

import Hledger.Utils.UTF8IOCompat (error')


-- | Regular expression. Extended regular expression-ish syntax ? But does not support eg (?i) syntax.
type Regexp = String

-- | A replacement pattern. May include numeric backreferences (\N).
type Replacement = String

-- | Convert our string-based Regexp to a real Regex.
-- Or if it's not well formed, call error with a "malformed regexp" message.
toRegex :: Regexp -> Regex
toRegex :: Regexp -> Regex
toRegex = (Regexp -> Regex) -> Regexp -> Regex
forall a b. Ord a => (a -> b) -> a -> b
memo (CompOption -> ExecOption -> Regexp -> Regex
compileRegexOrError CompOption
forall regex compOpt execOpt.
RegexOptions regex compOpt execOpt =>
compOpt
defaultCompOpt ExecOption
forall regex compOpt execOpt.
RegexOptions regex compOpt execOpt =>
execOpt
defaultExecOpt)

-- | Like toRegex but make a case-insensitive Regex.
toRegexCI :: Regexp -> Regex
toRegexCI :: Regexp -> Regex
toRegexCI = (Regexp -> Regex) -> Regexp -> Regex
forall a b. Ord a => (a -> b) -> a -> b
memo (CompOption -> ExecOption -> Regexp -> Regex
compileRegexOrError CompOption
forall regex compOpt execOpt.
RegexOptions regex compOpt execOpt =>
compOpt
defaultCompOpt{caseSensitive :: Bool
caseSensitive=Bool
False} ExecOption
forall regex compOpt execOpt.
RegexOptions regex compOpt execOpt =>
execOpt
defaultExecOpt)

compileRegexOrError :: CompOption -> ExecOption -> Regexp -> Regex
compileRegexOrError :: CompOption -> ExecOption -> Regexp -> Regex
compileRegexOrError compopt :: CompOption
compopt execopt :: ExecOption
execopt r :: Regexp
r =
  Regex -> Maybe Regex -> Regex
forall a. a -> Maybe a -> a
fromMaybe
  (Regexp -> Regex
forall a. Regexp -> a
errorWithoutStackTrace (Regexp -> Regex) -> Regexp -> Regex
forall a b. (a -> b) -> a -> b
$ "this regular expression could not be compiled: " Regexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++ Regexp -> Regexp
forall a. Show a => a -> Regexp
show Regexp
r) (Maybe Regex -> Regex) -> Maybe Regex -> Regex
forall a b. (a -> b) -> a -> b
$
  CompOption -> ExecOption -> Regexp -> Maybe Regex
forall regex compOpt execOpt source (m :: * -> *).
(RegexMaker regex compOpt execOpt source, MonadFail m) =>
compOpt -> execOpt -> source -> m regex
makeRegexOptsM CompOption
compopt ExecOption
execopt Regexp
r

-- regexMatch' :: RegexContext Regexp String a => Regexp -> String -> a
-- regexMatch' r s = s =~ (toRegex r)

regexMatches :: Regexp -> String -> Bool
regexMatches :: Regexp -> Regexp -> Bool
regexMatches = (Regexp -> Regexp -> Bool) -> Regexp -> Regexp -> Bool
forall a b c. (a -> b -> c) -> b -> a -> c
flip Regexp -> Regexp -> Bool
forall source source1 target.
(RegexMaker Regex CompOption ExecOption source,
 RegexContext Regex source1 target) =>
source1 -> source -> target
(=~)

regexMatchesCI :: Regexp -> String -> Bool
regexMatchesCI :: Regexp -> Regexp -> Bool
regexMatchesCI r :: Regexp
r = Regex -> Regexp -> Bool
forall regex source target.
RegexContext regex source target =>
regex -> source -> target
match (Regexp -> Regex
toRegexCI Regexp
r)

-- | Replace all occurrences of the regexp, transforming each match with the given function.
regexReplaceBy :: Regexp -> (String -> String) -> String -> String
regexReplaceBy :: Regexp -> (Regexp -> Regexp) -> Regexp -> Regexp
regexReplaceBy r :: Regexp
r = Regex -> (Regexp -> Regexp) -> Regexp -> Regexp
replaceAllBy (Regexp -> Regex
toRegex Regexp
r)

regexReplaceByCI :: Regexp -> (String -> String) -> String -> String
regexReplaceByCI :: Regexp -> (Regexp -> Regexp) -> Regexp -> Regexp
regexReplaceByCI r :: Regexp
r = Regex -> (Regexp -> Regexp) -> Regexp -> Regexp
replaceAllBy (Regexp -> Regex
toRegexCI Regexp
r)

-- | Replace all occurrences of the regexp with the replacement
-- pattern. The replacement pattern supports numeric backreferences
-- (\N) but no other RE syntax.
regexReplace :: Regexp -> Replacement -> String -> String
regexReplace :: Regexp -> Regexp -> Regexp -> Regexp
regexReplace re :: Regexp
re = Regex -> Regexp -> Regexp -> Regexp
replaceRegex (Regexp -> Regex
toRegex Regexp
re)

regexReplaceCI :: Regexp -> Replacement -> String -> String
regexReplaceCI :: Regexp -> Regexp -> Regexp -> Regexp
regexReplaceCI re :: Regexp
re = Regex -> Regexp -> Regexp -> Regexp
replaceRegex (Regexp -> Regex
toRegexCI Regexp
re)

-- | A memoising version of regexReplace. Caches the result for each
-- search pattern, replacement pattern, target string tuple.
regexReplaceMemo :: Regexp -> Replacement -> String -> String
regexReplaceMemo :: Regexp -> Regexp -> Regexp -> Regexp
regexReplaceMemo re :: Regexp
re repl :: Regexp
repl = (Regexp -> Regexp) -> Regexp -> Regexp
forall a b. Ord a => (a -> b) -> a -> b
memo (Regexp -> Regexp -> Regexp -> Regexp
regexReplace Regexp
re Regexp
repl)

regexReplaceCIMemo :: Regexp -> Replacement -> String -> String
regexReplaceCIMemo :: Regexp -> Regexp -> Regexp -> Regexp
regexReplaceCIMemo re :: Regexp
re repl :: Regexp
repl = (Regexp -> Regexp) -> Regexp -> Regexp
forall a b. Ord a => (a -> b) -> a -> b
memo (Regexp -> Regexp -> Regexp -> Regexp
regexReplaceCI Regexp
re Regexp
repl)

--

replaceRegex :: Regex -> Replacement -> String -> String
replaceRegex :: Regex -> Regexp -> Regexp -> Regexp
replaceRegex re :: Regex
re repl :: Regexp
repl s :: Regexp
s = (Regexp -> MatchText Regexp -> Regexp)
-> Regexp -> [MatchText Regexp] -> Regexp
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl (Regexp -> Regexp -> MatchText Regexp -> Regexp
replaceMatch Regexp
repl) Regexp
s ([MatchText Regexp] -> [MatchText Regexp]
forall a. [a] -> [a]
reverse ([MatchText Regexp] -> [MatchText Regexp])
-> [MatchText Regexp] -> [MatchText Regexp]
forall a b. (a -> b) -> a -> b
$ Regex -> Regexp -> [MatchText Regexp]
forall regex source target.
RegexContext regex source target =>
regex -> source -> target
match Regex
re Regexp
s :: [MatchText String])

replaceMatch :: Replacement -> String -> MatchText String -> String
replaceMatch :: Regexp -> Regexp -> MatchText Regexp -> Regexp
replaceMatch replpat :: Regexp
replpat s :: Regexp
s matchgroups :: MatchText Regexp
matchgroups = Regexp
pre Regexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++ Regexp
repl Regexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++ Regexp
post
  where
    ((_,(off :: MatchOffset
off,len :: MatchOffset
len)):_) = MatchText Regexp -> [(Regexp, (MatchOffset, MatchOffset))]
forall i e. Array i e -> [e]
elems MatchText Regexp
matchgroups  -- groups should have 0-based indexes, and there should always be at least one, since this is a match
    (pre :: Regexp
pre, post' :: Regexp
post') = MatchOffset -> Regexp -> (Regexp, Regexp)
forall a. MatchOffset -> [a] -> ([a], [a])
splitAt MatchOffset
off Regexp
s
    post :: Regexp
post = MatchOffset -> Regexp -> Regexp
forall a. MatchOffset -> [a] -> [a]
drop MatchOffset
len Regexp
post'
    repl :: Regexp
repl = Regex -> (Regexp -> Regexp) -> Regexp -> Regexp
replaceAllBy (Regexp -> Regex
toRegex "\\\\[0-9]+") (MatchText Regexp -> Regexp -> Regexp
replaceBackReference MatchText Regexp
matchgroups) Regexp
replpat

replaceBackReference :: MatchText String -> String -> String
replaceBackReference :: MatchText Regexp -> Regexp -> Regexp
replaceBackReference grps :: MatchText Regexp
grps ('\\':s :: Regexp
s@(_:_)) | (Char -> Bool) -> Regexp -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all Char -> Bool
isDigit Regexp
s =
  case Regexp -> MatchOffset
forall a. Read a => Regexp -> a
read Regexp
s of n :: MatchOffset
n | MatchOffset
n MatchOffset -> [MatchOffset] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` MatchText Regexp -> [MatchOffset]
forall i e. Ix i => Array i e -> [i]
indices MatchText Regexp
grps -> (Regexp, (MatchOffset, MatchOffset)) -> Regexp
forall a b. (a, b) -> a
fst (MatchText Regexp
grps MatchText Regexp
-> MatchOffset -> (Regexp, (MatchOffset, MatchOffset))
forall i e. Ix i => Array i e -> i -> e
! MatchOffset
n)
                 _                         -> Regexp -> Regexp
forall a. Regexp -> a
error' (Regexp -> Regexp) -> Regexp -> Regexp
forall a b. (a -> b) -> a -> b
$ "no match group exists for backreference \"\\"Regexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++Regexp
sRegexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++"\""
replaceBackReference _ s :: Regexp
s = Regexp -> Regexp
forall a. Regexp -> a
error' (Regexp -> Regexp) -> Regexp -> Regexp
forall a b. (a -> b) -> a -> b
$ "replaceBackReference called on non-numeric-backreference \""Regexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++Regexp
sRegexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++"\", shouldn't happen"

--

-- http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries :
-- | Replace all occurrences of a regexp in a string, transforming each match with the given function.
replaceAllBy :: Regex -> (String -> String) -> String -> String
replaceAllBy :: Regex -> (Regexp -> Regexp) -> Regexp -> Regexp
replaceAllBy re :: Regex
re f :: Regexp -> Regexp
f s :: Regexp
s = Regexp -> Regexp
start Regexp
end
  where
    (_, end :: Regexp
end, start :: Regexp -> Regexp
start) = ((MatchOffset, Regexp, Regexp -> Regexp)
 -> (MatchOffset, MatchOffset)
 -> (MatchOffset, Regexp, Regexp -> Regexp))
-> (MatchOffset, Regexp, Regexp -> Regexp)
-> [(MatchOffset, MatchOffset)]
-> (MatchOffset, Regexp, Regexp -> Regexp)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (MatchOffset, Regexp, Regexp -> Regexp)
-> (MatchOffset, MatchOffset)
-> (MatchOffset, Regexp, Regexp -> Regexp)
forall c.
(MatchOffset, Regexp, Regexp -> c)
-> (MatchOffset, MatchOffset) -> (MatchOffset, Regexp, Regexp -> c)
go (0, Regexp
s, Regexp -> Regexp
forall a. a -> a
id) ([(MatchOffset, MatchOffset)]
 -> (MatchOffset, Regexp, Regexp -> Regexp))
-> [(MatchOffset, MatchOffset)]
-> (MatchOffset, Regexp, Regexp -> Regexp)
forall a b. (a -> b) -> a -> b
$ (AllMatches [] (MatchOffset, MatchOffset)
-> [(MatchOffset, MatchOffset)]
forall (f :: * -> *) b. AllMatches f b -> f b
getAllMatches (AllMatches [] (MatchOffset, MatchOffset)
 -> [(MatchOffset, MatchOffset)])
-> AllMatches [] (MatchOffset, MatchOffset)
-> [(MatchOffset, MatchOffset)]
forall a b. (a -> b) -> a -> b
$ Regex -> Regexp -> AllMatches [] (MatchOffset, MatchOffset)
forall regex source target.
RegexContext regex source target =>
regex -> source -> target
match Regex
re Regexp
s :: [(Int, Int)])
    go :: (MatchOffset, Regexp, Regexp -> c)
-> (MatchOffset, MatchOffset) -> (MatchOffset, Regexp, Regexp -> c)
go (ind :: MatchOffset
ind,read :: Regexp
read,write :: Regexp -> c
write) (off :: MatchOffset
off,len :: MatchOffset
len) =
      let (skip :: Regexp
skip, start :: Regexp
start) = MatchOffset -> Regexp -> (Regexp, Regexp)
forall a. MatchOffset -> [a] -> ([a], [a])
splitAt (MatchOffset
off MatchOffset -> MatchOffset -> MatchOffset
forall a. Num a => a -> a -> a
- MatchOffset
ind) Regexp
read
          (matched :: Regexp
matched, remaining :: Regexp
remaining) = MatchOffset -> Regexp -> (Regexp, Regexp)
forall a. MatchOffset -> [a] -> ([a], [a])
splitAt MatchOffset
len Regexp
start
      in (MatchOffset
off MatchOffset -> MatchOffset -> MatchOffset
forall a. Num a => a -> a -> a
+ MatchOffset
len, Regexp
remaining, Regexp -> c
write (Regexp -> c) -> (Regexp -> Regexp) -> Regexp -> c
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Regexp
skipRegexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++) (Regexp -> Regexp) -> (Regexp -> Regexp) -> Regexp -> Regexp
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Regexp -> Regexp
f Regexp
matched Regexp -> Regexp -> Regexp
forall a. [a] -> [a] -> [a]
++))