1
1
mirror of https://github.com/tweag/asterius.git synced 2024-10-26 09:21:04 +03:00

Remove text shims

This commit is contained in:
Cheng Shao 2020-10-17 13:51:22 +00:00
parent 4dddcb3c98
commit ad42769975
124 changed files with 2 additions and 17801 deletions

View File

@ -73,12 +73,6 @@ ahc-cabal act-as-setup --build-type=Simple -- build -j --builddir=$ASTERIUS_TMP_
ahc-cabal act-as-setup --build-type=Simple -- install --builddir=$ASTERIUS_TMP_DIR/dist/ghci
popd
pushd text
ahc-cabal act-as-setup --build-type=Simple -- configure --builddir=$ASTERIUS_TMP_DIR/dist/text $ASTERIUS_CONFIGURE_OPTIONS
ahc-cabal act-as-setup --build-type=Simple -- build -j --builddir=$ASTERIUS_TMP_DIR/dist/text
ahc-cabal act-as-setup --build-type=Simple -- install --builddir=$ASTERIUS_TMP_DIR/dist/text
popd
ahc-cabal v1-install $ASTERIUS_CONFIGURE_OPTIONS \
aeson \
parsec

View File

@ -11,7 +11,6 @@ import { StableNameManager } from "./rts.stablename.mjs";
import { StaticPtrManager } from "./rts.staticptr.mjs";
import { Scheduler } from "./rts.scheduler.mjs";
import { IntegerManager } from "./rts.integer.mjs";
import { TextCBits } from "./rts.text.mjs";
import { TimeCBits } from "./rts.time.mjs";
import { GC } from "./rts.gc.mjs";
import { ExceptionHelper } from "./rts.exception.mjs";
@ -103,7 +102,6 @@ export async function newAsteriusInstance(req) {
__asterius_stableptr_manager
),
__asterius_integer_manager = new IntegerManager(),
__asterius_text_cbits = new TextCBits(__asterius_memory),
__asterius_time_cbits = new TimeCBits(__asterius_memory, req.targetSpecificModule),
__asterius_gc = new GC(
__asterius_memory,
@ -181,7 +179,6 @@ export async function newAsteriusInstance(req) {
write: (fd, buf, count) => __asterius_fs.write(fd, buf, count)
},
posix: modulify(new (req.targetSpecificModule.posix)(__asterius_memory, rtsConstants)),
text: modulify(__asterius_text_cbits),
time: modulify(__asterius_time_cbits),
// cannot name this float since float is a keyword.
floatCBits: modulify(__asterius_float_cbits),

View File

@ -1,64 +0,0 @@
import { Memory } from "./rts.memory.mjs";
export class TextCBits {
constructor(memory) {
this.memory = memory;
Object.seal(this);
}
_hs_text_memcpy(_dst, dst_off, _src, src_off, n) {
const dst = Memory.unTag(_dst) + dst_off * 2,
src = Memory.unTag(_src) + src_off * 2;
this.memory.i8View.copyWithin(dst, src, src + n * 2);
}
_hs_text_memcmp(_dst, dst_off, _src, src_off, n) {
const dst = Memory.unTag(_dst) + dst_off * 2,
src = Memory.unTag(_src) + src_off * 2;
for (let i = 0; i < n * 2; ++i) {
const sgn = Math.sign(
this.memory.i8View[dst + i] - this.memory.i8View[src + i]
);
if (sgn) return sgn;
}
return 0;
}
_hs_text_decode_utf8(dest, destoffp, src, srcend) {
const dec = new TextDecoder("utf-8", { fatal: true }),
s = dec.decode(
this.memory.i8View.subarray(Memory.unTag(src), Memory.unTag(srcend))
);
for (let i = 0; i < s.length; ++i)
this.memory.i16Store(dest + i * 2, s.charCodeAt(i));
this.memory.i64Store(destoffp, s.length);
return srcend;
}
_hs_text_encode_utf8(destp, src, srcoff, srclen) {
const dec = new TextDecoder("utf-16le", { fatal: true }),
s = dec.decode(
this.memory.i8View.subarray(
Memory.unTag(src + srcoff * 2),
Memory.unTag(src + srcoff * 2 + srclen * 2)
)
),
dest = Number(this.memory.i64Load(destp)),
enc = new TextEncoder(),
// `Data.Text.Encoding.encodeUtf8` allocates a `ByteArray#` of size
// `srclen * 3` to ensure enough space for a single-pass encoding from
// UTF-16 to UTF-8. See the comment near
// `https://github.com/haskell/text/blob/2176eb38b5238e763e8076b0d0db8c2f2014ab8b/Data/Text/Encoding.hs#L432`
// and the "Buffer Sizing" section of
// `https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encodeInto`
// for details.
l = enc.encodeInto(
s,
this.memory.i8View.subarray(
Memory.unTag(dest),
Memory.unTag(dest + srclen * 3)
)
).written;
this.memory.i64Store(destp, dest + l);
}
}

View File

@ -136,9 +136,7 @@ rtsAsteriusModule opts =
SM.fromList $
map
(\(func_sym, (_, func)) -> (func_sym, func))
( floatCBits
<> textCBits
)
floatCBits
}
<> hsInitFunction opts
<> wasmApplyRelocsFunction opts
@ -615,8 +613,7 @@ rtsFunctionImports debug =
)
<> map
(fst . snd)
( floatCBits <> textCBits
)
floatCBits
<> schedulerImports
<> exportsImports
<> envImports
@ -711,20 +708,6 @@ rtsGlobalExports = mempty
emitErrorMessage :: [ValueType] -> BS.ByteString -> Expression
emitErrorMessage vts ev = Barf {barfMessage = ev, barfReturnTypes = vts}
textCBits :: [(EntitySymbol, (FunctionImport, Function))]
textCBits =
map
( \(func_sym, param_vts, ret_vts) ->
( mkEntitySymbol func_sym,
generateRTSWrapper "text" func_sym param_vts ret_vts
)
)
[ ("_hs_text_memcpy", [I64, I64, I64, I64, I64], []),
("_hs_text_memcmp", [I64, I64, I64, I64, I64], [I64]),
("_hs_text_decode_utf8", [I64, I64, I64, I64], [I64]),
("_hs_text_encode_utf8", [I64, I64, I64, I64], [])
]
floatCBits :: [(EntitySymbol, (FunctionImport, Function))]
floatCBits =
map

View File

@ -1,10 +0,0 @@
/dist/
/dist-boot/
/dist-install/
/dist-newstyle/
/cabal-dev/
/cabal.sandbox.config
/ghc.mk
/GNUmakefile
/.ghc.environment.*
/cabal.project.local

View File

@ -1,14 +0,0 @@
^(?:dist|benchmarks/dist|tests/coverage|tests/dist)$
^benchmarks/.*\.txt$
^tests/text-testdata.tar.bz2$
^tests/(?:\.hpc|bm|qc|qc-hpc|stdio-hpc|text/test)$
\.(?:aux|eventlog|h[ip]|log|[oa]|orig|prof|ps|rej|swp|tix)$
~$
syntax: glob
.\#*
cabal-dev
cabal.sandbox.config
\.cabal-sandbox
scripts/CaseFolding.txt
scripts/SpecialCasing.txt

View File

@ -1,67 +0,0 @@
0edd5dc96024e6c0f56f175ffa673f9178cece8c 0.9.0.0
1de0da99d31ba212e179d88957cf5f3b9c4facd7 0.7.0.1
30575507578cf04b5b1501ee23b1faa83315eb88 0.1
321e61f17630dbd346c1c9991831572f964b7d5a 0.4
34cef27f87287cd3c17d78ac99e9de2d6c8a1200 0.6
3cbff6d9bca135a85e428a7f6c4e15739a562b8b 0.7.1.0
587c8661393f83cb2b9452f6cc1d6885e52fe544 0.8.1.0
61c322bd5b6446255870422f854266396169a7e5 0.5
9adb5edd9b6407335e8772ea86036b938471fc89 0.2
b4fdbef1c1027a70b95c1a81747d968890acc3f6 0.3
de7cb0cb2a72606126c357b860bb0dbf7f145ad2 0.9.0.1
f2c9b1c019e3277926378d7b5a604f1c995f0202 0.7
f3d625f2d0730cb6adf8506a1f37762178c9aea6 0.10.0.0
99ea36957b1d47145d70f2ee8b13f1ddfd09245c 0.10.0.1
1b7889216e78b00d69b32c48782111bcd935978d 0.10.0.2
75157eaec6eb38574ad5c347e75c48dcb7262dee 0.11.0.0
0d7bc7bd77ec5a8f6d064de59d459e5eaf906b0f 0.11.0.1
a9d582a75dea9a4417883019d5110a35e4c07265 0.11.0.2
36b2a3aa661892ea25101199220383ba6d579abb 0.11.0.3
06e407bcdc9a3905abf9551062ecc6f92abcdde5 0.11.0.4
94bcf16e812f6fb49b87598673e17270f48c2bf1 0.11.0.5
784a32ffbec0b77adb43e9a64328db0482007309 0.11.0.6
c808123a3d37f25004d2aad8f3a395acfcf9122f 0.11.0.7
679826e742b239c5dd0c1fa4ebe98aa4bfe02a00 0.11.0.8
b49eebfa5673da89a23939d523c9de9edbd410b0 0.11.1.0
9c820a2ecaea01bc614d8e6bccf90e57431bdcbb 0.11.1.1
ed3a60ec627af6ba8af321009aa00224b3296e47 0.11.1.2
b75d3041d275e8e76b26605b0d426a572ddf1249 0.11.1.3
53906ad0c7e64f6c9a5df569e7cc642579aab8fc 0.11.1.5
9d6d3a9690ade506897c072060eb92868e229d5c 0.11.1.6
5ac062eace36b333566d1676979a57039a188840 0.11.1.7
9f01361a73071936b6ab0bdb879f1a45cca6577a 0.11.1.8
5dce2a934be53e74310fedffb51d7bd3e61fa993 0.11.1.9
9f47a2cfc9e51fd622d7553f08ad2ac1faad0438 0.11.1.10
9f47a2cfc9e51fd622d7553f08ad2ac1faad0438 0.11.1.10
407937739e9e764f1ae0f1f9ca454c42dca38772 0.11.1.10
8b981edd27befa4c2dd334fcb7db22ac67e22b67 0.11.1.11
204da16b5098531bdf858c388e2620238ef2aa5e 0.11.1.12
6a3d847a56a69d0514a79cb212cb218271ad0917 0.11.1.13
1d2c6fa9092c6a4000b2abdd9d01f3efcd477be5 0.11.2.0
78219784cf3652cc662805bf2971bd62d80210a9 0.11.2.1
4297307ebc11ad677cfba6b40319e7e5e2c0cfee 0.11.2.3
7fa79662b66aade97fe49394977213fe6432942e 0.11.3.0
d99cd091cdf71ce807a4255f6cc509c3154f51ea 0.11.3.1
31f57612fd425d1bb56352ce6b79e41ce6699a45 1.0.0.0
7cba97c86467c1881dfced68613891f3c65561b0 1.0.0.1
62674a9bbc83145d1d7be28a05d456ab6bb5ff89 1.1.0.0
68a32d80c0829da164d1e6b3acedcc3efeda0957 1.1.0.1
6d863daafbf0e1289ec0a0eb9b4be2a0a9cb8d3d 1.1.1.0
fb7299f22449750c653ac64dee4bd63c13ea6e4c 1.1.1.1
eb170cb533b63534d166b5fd5344b36676adf140 1.1.1.2
e749e05f715d02445803f653cd3d97d5395d7c6f 1.1.1.3
b0dff8ca1142595bab489a768ca62e6a6805aac4 1.1.1.4
f8a60279237c1c28e094868740d9188dd94fcc9e 1.2.0.0
6231451979b2d33e28038b75f2dab19c9731f5c3 1.2.0.1
99a3503b4ad001fc1894e90286e96d7dea1e9195 1.2.0.2
47f138edf15576b9adaeed64784e30811099f202 1.2.0.3
0b064b0c69106bbcd13cb0099e72ae5fa2347bf0 1.2.0.4
8bbb3e222a0037035d893f731b0a23be449f7346 1.2.0.5
18c7f8e19872762b7e28ee1871e72f70ea54e3cd 1.2.0.6
42f751b87b4a187db93f595a8166d51746ff57d4 1.2.1.0
78227a84740b70dcd4cf7a65f3bcf603d87e48e2 1.2.1.1
40f268e6735749ef36f487e1d928eaa957bb431d 1.2.1.2
631fd055e27fa10983e17e9ad36d1b285b85c661 1.2.1.3
8221a68ed5ec37028357df9c6c25f52320e54ad9 1.2.2.0
ce1c5ac65eb9f5580114b4e380092b05a485e95d 1.2.2.1
d72c634a91ffe27ec5cd03921dd53efc8129d8f6 1.2.2.2

View File

@ -1,158 +0,0 @@
# This Travis job script has been generated by a script via
#
# haskell-ci 'text.cabal'
#
# For more information, see https://github.com/haskell-CI/haskell-ci
#
# version: 0.3.20190521
#
language: c
dist: xenial
git:
# whether to recursively clone submodules
submodules: false
cache:
directories:
- $HOME/.cabal/packages
- $HOME/.cabal/store
before_cache:
- rm -fv $CABALHOME/packages/hackage.haskell.org/build-reports.log
# remove files that are regenerated by 'cabal update'
- rm -fv $CABALHOME/packages/hackage.haskell.org/00-index.*
- rm -fv $CABALHOME/packages/hackage.haskell.org/*.json
- rm -fv $CABALHOME/packages/hackage.haskell.org/01-index.cache
- rm -fv $CABALHOME/packages/hackage.haskell.org/01-index.tar
- rm -fv $CABALHOME/packages/hackage.haskell.org/01-index.tar.idx
- rm -rfv $CABALHOME/packages/head.hackage
matrix:
include:
- compiler: ghc-8.6.5
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-8.6.5","cabal-install-2.4"]}}
- compiler: ghc-8.4.4
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-8.4.4","cabal-install-2.4"]}}
- compiler: ghc-8.2.2
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-8.2.2","cabal-install-2.4"]}}
- compiler: ghc-8.0.2
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-8.0.2","cabal-install-2.4"]}}
- compiler: ghc-7.10.3
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-7.10.3","cabal-install-2.4"]}}
- compiler: ghc-7.8.4
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-7.8.4","cabal-install-2.4"]}}
- compiler: ghc-7.6.3
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-7.6.3","cabal-install-2.4"]}}
- compiler: ghc-7.4.2
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-7.4.2","cabal-install-2.4"]}}
- compiler: ghc-7.2.2
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-7.2.2","cabal-install-2.4"]}}
- compiler: ghc-7.0.4
addons: {"apt":{"sources":["hvr-ghc"],"packages":["ghc-7.0.4","cabal-install-2.4"]}}
before_install:
- HC=$(echo "/opt/$CC/bin/ghc" | sed 's/-/\//')
- HCPKG="$HC-pkg"
- unset CC
- CABAL=/opt/ghc/bin/cabal
- CABALHOME=$HOME/.cabal
- export PATH="$CABALHOME/bin:$PATH"
- TOP=$(pwd)
- HCNUMVER=$(( $(${HC} --numeric-version|sed -E 's/([0-9]+)\.([0-9]+)\.([0-9]+).*/\1 * 10000 + \2 * 100 + \3/') ))
- echo $HCNUMVER
- CABAL="$CABAL -vnormal+nowrap+markoutput"
- set -o pipefail
- |
echo 'function blue(s) { printf "\033[0;34m" s "\033[0m " }' >> .colorful.awk
echo 'BEGIN { state = "output"; }' >> .colorful.awk
echo '/^-----BEGIN CABAL OUTPUT-----$/ { state = "cabal" }' >> .colorful.awk
echo '/^-----END CABAL OUTPUT-----$/ { state = "output" }' >> .colorful.awk
echo '!/^(-----BEGIN CABAL OUTPUT-----|-----END CABAL OUTPUT-----)/ {' >> .colorful.awk
echo ' if (state == "cabal") {' >> .colorful.awk
echo ' print blue($0)' >> .colorful.awk
echo ' } else {' >> .colorful.awk
echo ' print $0' >> .colorful.awk
echo ' }' >> .colorful.awk
echo '}' >> .colorful.awk
- cat .colorful.awk
- |
color_cabal_output () {
awk -f $TOP/.colorful.awk
}
- echo text | color_cabal_output
install:
- ${CABAL} --version
- echo "$(${HC} --version) [$(${HC} --print-project-git-commit-id 2> /dev/null || echo '?')]"
- TEST=--enable-tests
- BENCH=--enable-benchmarks
- GHCHEAD=${GHCHEAD-false}
- rm -f $CABALHOME/config
- |
echo "verbose: normal +nowrap +markoutput" >> $CABALHOME/config
echo "remote-build-reporting: anonymous" >> $CABALHOME/config
echo "remote-repo-cache: $CABALHOME/packages" >> $CABALHOME/config
echo "logs-dir: $CABALHOME/logs" >> $CABALHOME/config
echo "world-file: $CABALHOME/world" >> $CABALHOME/config
echo "extra-prog-path: $CABALHOME/bin" >> $CABALHOME/config
echo "symlink-bindir: $CABALHOME/bin" >> $CABALHOME/config
echo "installdir: $CABALHOME/bin" >> $CABALHOME/config
echo "build-summary: $CABALHOME/logs/build.log" >> $CABALHOME/config
echo "store-dir: $CABALHOME/store" >> $CABALHOME/config
echo "install-dirs user" >> $CABALHOME/config
echo " prefix: $CABALHOME" >> $CABALHOME/config
echo "repository hackage.haskell.org" >> $CABALHOME/config
echo " url: http://hackage.haskell.org/" >> $CABALHOME/config
- cat $CABALHOME/config
- rm -fv cabal.project cabal.project.local cabal.project.freeze
- travis_retry ${CABAL} v2-update -v
# Generate cabal.project
- rm -rf cabal.project cabal.project.local cabal.project.freeze
- touch cabal.project
- |
echo 'packages: "."' >> cabal.project
- |
echo "write-ghc-environment-files: always" >> cabal.project
- "for pkg in $($HCPKG list --simple-output); do echo $pkg | sed 's/-[^-]*$//' | (grep -vE -- '^(text)$' || true) | sed 's/^/constraints: /' | sed 's/$/ installed/' >> cabal.project.local; done"
- cat cabal.project || true
- cat cabal.project.local || true
- if [ -f "./configure.ac" ]; then (cd "." && autoreconf -i); fi
- ${CABAL} v2-freeze -w ${HC} ${TEST} ${BENCH} | color_cabal_output
- "cat cabal.project.freeze | sed -E 's/^(constraints: *| *)//' | sed 's/any.//'"
- rm cabal.project.freeze
- ${CABAL} v2-build -w ${HC} ${TEST} ${BENCH} --dep -j2 all | color_cabal_output
- ${CABAL} v2-build -w ${HC} --disable-tests --disable-benchmarks --dep -j2 all | color_cabal_output
script:
- DISTDIR=$(mktemp -d /tmp/dist-test.XXXX)
# Packaging...
- ${CABAL} v2-sdist all | color_cabal_output
# Unpacking...
- mv dist-newstyle/sdist/*.tar.gz ${DISTDIR}/
- cp -av th-tests ${DISTDIR}/th-tests.src
- cd ${DISTDIR} || false
- find . -maxdepth 1 -name '*.tar.gz' -exec tar -xvf '{}' \;
# Generate cabal.project
- rm -rf cabal.project cabal.project.local cabal.project.freeze
- touch cabal.project
- |
echo 'packages: "text-*/*.cabal"' >> cabal.project
- |
echo 'packages: ./th-tests.src/' >> cabal.project
- |
echo "write-ghc-environment-files: always" >> cabal.project
- "for pkg in $($HCPKG list --simple-output); do echo $pkg | sed 's/-[^-]*$//' | (grep -vE -- '^(text)$' || true) | sed 's/^/constraints: /' | sed 's/$/ installed/' >> cabal.project.local; done"
- cat cabal.project || true
- cat cabal.project.local || true
# Building...
# this builds all libraries and executables (without tests/benchmarks)
- ${CABAL} v2-build -w ${HC} --disable-tests --disable-benchmarks all | color_cabal_output
# Building with tests and benchmarks...
# build & run tests, build benchmarks
- ${CABAL} v2-build -w ${HC} ${TEST} ${BENCH} all | color_cabal_output
# Testing...
- ${CABAL} v2-test -w ${HC} ${TEST} ${BENCH} all | color_cabal_output
# cabal check...
- (cd text-* && ${CABAL} -vnormal check)
# haddock...
- ${CABAL} v2-haddock -w ${HC} ${TEST} ${BENCH} all | color_cabal_output
# Building without installed constraints for packages in global-db...
- rm -f cabal.project.local
- ${CABAL} v2-build -w ${HC} --disable-tests --disable-benchmarks all | color_cabal_output
# REGENDATA ["text.cabal"]
# EOF

File diff suppressed because it is too large Load Diff

View File

@ -1,249 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types,
RecordWildCards, UnboxedTuples, UnliftedFFITypes #-}
{-# OPTIONS_GHC -fno-warn-unused-matches #-}
-- |
-- Module : Data.Text.Array
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : portable
--
-- Packed, unboxed, heap-resident arrays. Suitable for performance
-- critical use, both in terms of large data quantities and high
-- speed.
--
-- This module is intended to be imported @qualified@, to avoid name
-- clashes with "Prelude" functions, e.g.
--
-- > import qualified Data.Text.Array as A
--
-- The names in this module resemble those in the 'Data.Array' family
-- of modules, but are shorter due to the assumption of qualified
-- naming.
module Data.Text.Array
(
-- * Types
Array(Array, aBA)
, MArray(MArray, maBA)
-- * Functions
, copyM
, copyI
, empty
, equal
#if defined(ASSERTS)
, length
#endif
, run
, run2
, toList
, unsafeFreeze
, unsafeIndex
, new
, unsafeWrite
) where
#if defined(ASSERTS)
-- This fugly hack is brought by GHC's apparent reluctance to deal
-- with MagicHash and UnboxedTuples when inferring types. Eek!
# define CHECK_BOUNDS(_func_,_len_,_k_) \
if (_k_) < 0 || (_k_) >= (_len_) then error ("Data.Text.Array." ++ (_func_) ++ ": bounds error, offset " ++ show (_k_) ++ ", length " ++ show (_len_)) else
#else
# define CHECK_BOUNDS(_func_,_len_,_k_)
#endif
#include "MachDeps.h"
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
#if MIN_VERSION_base(4,4,0)
import Control.Monad.ST.Unsafe (unsafeIOToST)
#else
import Control.Monad.ST (unsafeIOToST)
#endif
import Data.Bits ((.&.), xor)
import Data.Text.Internal.Unsafe (inlinePerformIO)
import Data.Text.Internal.Unsafe.Shift (shiftL, shiftR)
#if MIN_VERSION_base(4,5,0)
import Foreign.C.Types (CInt(CInt), CSize(CSize))
#else
import Foreign.C.Types (CInt, CSize)
#endif
import GHC.Base (ByteArray#, MutableByteArray#, Int(..),
indexWord16Array#, newByteArray#,
unsafeFreezeByteArray#, writeWord16Array#)
import GHC.ST (ST(..), runST)
import GHC.Word (Word16(..))
import Prelude hiding (length, read)
-- | Immutable array type.
--
-- The 'Array' constructor is exposed since @text-1.1.1.3@
data Array = Array {
aBA :: ByteArray#
#if defined(ASSERTS)
, aLen :: {-# UNPACK #-} !Int -- length (in units of Word16, not bytes)
#endif
}
-- | Mutable array type, for use in the ST monad.
--
-- The 'MArray' constructor is exposed since @text-1.1.1.3@
data MArray s = MArray {
maBA :: MutableByteArray# s
#if defined(ASSERTS)
, maLen :: {-# UNPACK #-} !Int -- length (in units of Word16, not bytes)
#endif
}
#if defined(ASSERTS)
-- | Operations supported by all arrays.
class IArray a where
-- | Return the length of an array.
length :: a -> Int
instance IArray Array where
length = aLen
{-# INLINE length #-}
instance IArray (MArray s) where
length = maLen
{-# INLINE length #-}
#endif
-- | Create an uninitialized mutable array.
new :: forall s. Int -> ST s (MArray s)
new n
| n < 0 || n .&. highBit /= 0 = array_size_error
| otherwise = ST $ \s1# ->
case newByteArray# len# s1# of
(# s2#, marr# #) -> (# s2#, MArray marr#
#if defined(ASSERTS)
n
#endif
#)
where !(I# len#) = bytesInArray n
highBit = maxBound `xor` (maxBound `shiftR` 1)
{-# INLINE new #-}
array_size_error :: a
array_size_error = error "Data.Text.Array.new: size overflow"
-- | Freeze a mutable array. Do not mutate the 'MArray' afterwards!
unsafeFreeze :: MArray s -> ST s Array
unsafeFreeze MArray{..} = ST $ \s1# ->
case unsafeFreezeByteArray# maBA s1# of
(# s2#, ba# #) -> (# s2#, Array ba#
#if defined(ASSERTS)
maLen
#endif
#)
{-# INLINE unsafeFreeze #-}
-- | Indicate how many bytes would be used for an array of the given
-- size.
bytesInArray :: Int -> Int
bytesInArray n = n `shiftL` 1
{-# INLINE bytesInArray #-}
-- | Unchecked read of an immutable array. May return garbage or
-- crash on an out-of-bounds access.
unsafeIndex :: Array -> Int -> Word16
unsafeIndex Array{..} i@(I# i#) =
CHECK_BOUNDS("unsafeIndex",aLen,i)
case indexWord16Array# aBA i# of r# -> (W16# r#)
{-# INLINE unsafeIndex #-}
-- | Unchecked write of a mutable array. May return garbage or crash
-- on an out-of-bounds access.
unsafeWrite :: MArray s -> Int -> Word16 -> ST s ()
unsafeWrite MArray{..} i@(I# i#) (W16# e#) = ST $ \s1# ->
CHECK_BOUNDS("unsafeWrite",maLen,i)
case writeWord16Array# maBA i# e# s1# of
s2# -> (# s2#, () #)
{-# INLINE unsafeWrite #-}
-- | Convert an immutable array to a list.
toList :: Array -> Int -> Int -> [Word16]
toList ary off len = loop 0
where loop i | i < len = unsafeIndex ary (off+i) : loop (i+1)
| otherwise = []
-- | An empty immutable array.
empty :: Array
empty = runST (new 0 >>= unsafeFreeze)
-- | Run an action in the ST monad and return an immutable array of
-- its result.
run :: (forall s. ST s (MArray s)) -> Array
run k = runST (k >>= unsafeFreeze)
-- | Run an action in the ST monad and return an immutable array of
-- its result paired with whatever else the action returns.
run2 :: (forall s. ST s (MArray s, a)) -> (Array, a)
run2 k = runST (do
(marr,b) <- k
arr <- unsafeFreeze marr
return (arr,b))
{-# INLINE run2 #-}
-- | Copy some elements of a mutable array.
copyM :: MArray s -- ^ Destination
-> Int -- ^ Destination offset
-> MArray s -- ^ Source
-> Int -- ^ Source offset
-> Int -- ^ Count
-> ST s ()
copyM dest didx src sidx count
| count <= 0 = return ()
| otherwise =
#if defined(ASSERTS)
assert (sidx + count <= length src) .
assert (didx + count <= length dest) .
#endif
unsafeIOToST $ memcpyM (maBA dest) (fromIntegral didx)
(maBA src) (fromIntegral sidx)
(fromIntegral count)
{-# INLINE copyM #-}
-- | Copy some elements of an immutable array.
copyI :: MArray s -- ^ Destination
-> Int -- ^ Destination offset
-> Array -- ^ Source
-> Int -- ^ Source offset
-> Int -- ^ First offset in destination /not/ to
-- copy (i.e. /not/ length)
-> ST s ()
copyI dest i0 src j0 top
| i0 >= top = return ()
| otherwise = unsafeIOToST $
memcpyI (maBA dest) (fromIntegral i0)
(aBA src) (fromIntegral j0)
(fromIntegral (top-i0))
{-# INLINE copyI #-}
-- | Compare portions of two arrays for equality. No bounds checking
-- is performed.
equal :: Array -- ^ First
-> Int -- ^ Offset into first
-> Array -- ^ Second
-> Int -- ^ Offset into second
-> Int -- ^ Count
-> Bool
equal arrA offA arrB offB count = inlinePerformIO $ do
i <- memcmp (aBA arrA) (fromIntegral offA)
(aBA arrB) (fromIntegral offB) (fromIntegral count)
return $! i == 0
{-# INLINE equal #-}
foreign import ccall unsafe "_hs_text_memcpy" memcpyI
:: MutableByteArray# s -> CSize -> ByteArray# -> CSize -> CSize -> IO ()
foreign import ccall unsafe "_hs_text_memcmp" memcmp
:: ByteArray# -> CSize -> ByteArray# -> CSize -> CSize -> IO CInt
foreign import ccall unsafe "_hs_text_memcpy" memcpyM
:: MutableByteArray# s -> CSize -> MutableByteArray# s -> CSize -> CSize
-> IO ()

View File

@ -1,535 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
UnliftedFFITypes #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Encoding
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts,
-- (c) 2008, 2009 Tom Harper
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : portable
--
-- Functions for converting 'Text' values to and from 'ByteString',
-- using several standard encodings.
--
-- To gain access to a much larger family of encodings, use the
-- <http://hackage.haskell.org/package/text-icu text-icu package>.
module Data.Text.Encoding
(
-- * Decoding ByteStrings to Text
-- $strict
decodeASCII
, decodeLatin1
, decodeUtf8
, decodeUtf16LE
, decodeUtf16BE
, decodeUtf32LE
, decodeUtf32BE
-- ** Catchable failure
, decodeUtf8'
-- ** Controllable error handling
, decodeUtf8With
, decodeUtf16LEWith
, decodeUtf16BEWith
, decodeUtf32LEWith
, decodeUtf32BEWith
-- ** Stream oriented decoding
-- $stream
, streamDecodeUtf8
, streamDecodeUtf8With
, Decoding(..)
-- * Encoding Text to ByteStrings
, encodeUtf8
, encodeUtf16LE
, encodeUtf16BE
, encodeUtf32LE
, encodeUtf32BE
-- * Encoding Text using ByteString Builders
, encodeUtf8Builder
, encodeUtf8BuilderEscaped
) where
#if MIN_VERSION_base(4,4,0)
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
#else
import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
#endif
import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
import Control.Monad.ST (runST)
import Data.Bits ((.&.))
import Data.ByteString as B
import Data.ByteString.Internal as B hiding (c2w)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), safe, text)
import Data.Text.Internal.Private (runText)
import Data.Text.Internal.Unsafe.Char (ord, unsafeWrite)
import Data.Text.Internal.Unsafe.Shift (shiftR)
import Data.Text.Show ()
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8, Word32)
#if MIN_VERSION_base(4,5,0)
import Foreign.C.Types (CSize(CSize))
#else
import Foreign.C.Types (CSize)
#endif
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke)
import GHC.Base (ByteArray#, MutableByteArray#)
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Builder.Prim.Internal as BP
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Encoding.Fusion as E
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import qualified Data.Text.Internal.Fusion as F
#include "text_cbits.h"
-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
-- encoded in one of the Unicode Transformation Formats (UTF) operate
-- in a /strict/ mode: each will throw an exception if given invalid
-- input.
--
-- Each function has a variant, whose name is suffixed with -'With',
-- that gives greater control over the handling of decoding errors.
-- For instance, 'decodeUtf8' will throw an exception, but
-- 'decodeUtf8With' allows the programmer to determine what to do on a
-- decoding error.
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
decodeASCII :: ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
--
-- 'decodeLatin1' is semantically equivalent to
-- @Data.Text.pack . Data.ByteString.Char8.unpack@
decodeLatin1 :: ByteString -> Text
decodeLatin1 (PS fp off len) = text a 0 len
where
a = A.run (A.new len >>= unsafeIOToST . go)
go dest = withForeignPtr fp $ \ptr -> do
c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
return dest
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- __NOTE__: The replacement character returned by 'OnDecodeError'
-- MUST be within the BMP plane; surrogate code points will
-- automatically be remapped to the replacement char @U+FFFD@
-- (/since 0.11.3.0/), whereas code points beyond the BMP will throw an
-- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
-- those unsupported code points would result in undefined behavior.
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
let go dest = withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr -> do
let end = ptr `plusPtr` (off + len)
loop curPtr = do
curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
if curPtr' == end
then do
n <- peek destOffPtr
unsafeSTToIO (done dest (fromIntegral n))
else do
x <- peek curPtr'
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c
| c > '\xFFFF' -> throwUnsupportedReplChar
| otherwise -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff)
(safe c)
poke destOffPtr (destOff + fromIntegral w)
loop $ curPtr' `plusPtr` 1
loop (ptr `plusPtr` off)
(unsafeIOToST . go) =<< A.new len
where
desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
throwUnsupportedReplChar = throwIO $
ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
-- TODO: The code currently assumes that the transcoded UTF-16
-- stream is at most twice as long (in bytes) as the input UTF-8
-- stream. To justify this assumption one has to assume that the
-- error handler replacement character also satisfies this
-- invariant, by emitting at most one UTF16 code unit.
--
-- One easy way to support the full range of code-points for
-- replacement characters in the error handler is to simply change
-- the (over-)allocation to `A.new (2*len)` and then shrink back the
-- `ByteArray#` to the real size (recent GHCs have a cheap
-- `ByteArray#` resize-primop for that which allow the GC to reclaim
-- the overallocation). However, this would require 4 times as much
-- (temporary) storage as the original UTF-8 required.
--
-- Another strategy would be to optimistically assume that
-- replacement characters are within the BMP, and if the case of a
-- non-BMP replacement occurs reallocate the target buffer (or throw
-- an exception, and fallback to a pessimistic codepath, like e.g.
-- `decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)`)
--
-- Alternatively, `OnDecodeError` could become a datastructure which
-- statically encodes the replacement-character range,
-- e.g. something isomorphic to
--
-- Either (... -> Maybe Word16) (... -> Maybe Char)
--
-- And allow to statically switch between the BMP/non-BMP
-- replacement-character codepaths. There's multiple ways to address
-- this with different tradeoffs; but ideally we should optimise for
-- the optimistic/error-free case.
{- INLINE[0] decodeUtf8With #-}
-- $stream
--
-- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept
-- a 'ByteString' that represents a possibly incomplete input (e.g. a
-- packet from a network stream) that may not end on a UTF-8 boundary.
--
-- 1. The maximal prefix of 'Text' that could be decoded from the
-- given input.
--
-- 2. The suffix of the 'ByteString' that could not be decoded due to
-- insufficient input.
--
-- 3. A function that accepts another 'ByteString'. That string will
-- be assumed to directly follow the string that was passed as
-- input to the original function, and it will in turn be decoded.
--
-- To help understand the use of these functions, consider the Unicode
-- string @\"hi &#9731;\"@. If encoded as UTF-8, this becomes @\"hi
-- \\xe2\\x98\\x83\"@; the final @\'&#9731;\'@ is encoded as 3 bytes.
--
-- Now suppose that we receive this encoded string as 3 packets that
-- are split up on untidy boundaries: @[\"hi \\xe2\", \"\\x98\",
-- \"\\x83\"]@. We cannot decode the entire Unicode string until we
-- have received all three packets, but we would like to make progress
-- as we receive each one.
--
-- @
-- ghci> let s0\@('Some' _ _ f0) = 'streamDecodeUtf8' \"hi \\xe2\"
-- ghci> s0
-- 'Some' \"hi \" \"\\xe2\" _
-- @
--
-- We use the continuation @f0@ to decode our second packet.
--
-- @
-- ghci> let s1\@('Some' _ _ f1) = f0 \"\\x98\"
-- ghci> s1
-- 'Some' \"\" \"\\xe2\\x98\"
-- @
--
-- We could not give @f0@ enough input to decode anything, so it
-- returned an empty string. Once we feed our second continuation @f1@
-- the last byte of input, it will make progress.
--
-- @
-- ghci> let s2\@('Some' _ _ f2) = f1 \"\\x83\"
-- ghci> s2
-- 'Some' \"\\x2603\" \"\" _
-- @
--
-- If given invalid input, an exception will be thrown by the function
-- or continuation where it is encountered.
-- | A stream oriented decoding result.
--
-- @since 1.0.0.0
data Decoding = Some Text ByteString (ByteString -> Decoding)
instance Show Decoding where
showsPrec d (Some t bs _) = showParen (d > prec) $
showString "Some " . showsPrec prec' t .
showChar ' ' . showsPrec prec' bs .
showString " _"
where prec = 10; prec' = prec + 1
newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text that is known to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown (either by this function or a continuation) that cannot be
-- caught in pure code. For more control over the handling of invalid
-- data, use 'streamDecodeUtf8With'.
--
-- @since 1.0.0.0
streamDecodeUtf8 :: ByteString -> Decoding
streamDecodeUtf8 = streamDecodeUtf8With strictDecode
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text.
--
-- Invalid bytes are passed to the 'OnDecodeError' handler, which may
-- substitute a replacement character ('Just') or skip the byte
-- ('Nothing').
--
-- @since 1.0.0.0
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
  where
    -- We create a slightly larger than necessary buffer to accommodate a
    -- potential surrogate pair started in the last buffer
    --
    -- undecoded0 holds leftover bytes from previous chunks;
    -- codepoint0/state0 are the decoder's DFA state carried across chunks.
    decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
                -> Decoding
    decodeChunk undecoded0 codepoint0 state0 bs@(PS fp off len) =
        runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
      where
        decodeChunkToBuffer :: A.MArray s -> IO Decoding
        decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
          with (0::CSize) $ \destOffPtr ->
          with codepoint0 $ \codepointPtr ->
          with state0 $ \statePtr ->
          with nullPtr $ \curPtrPtr ->
            let end = ptr `plusPtr` (off + len)
                loop curPtr = do
                  poke curPtrPtr curPtr
                  -- The C decoder consumes input until it hits an invalid
                  -- byte or the end of the chunk, writing decoded units
                  -- into dest and advancing destOffPtr.
                  curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
                             curPtrPtr end codepointPtr statePtr
                  state <- peek statePtr
                  case state of
                    UTF8_REJECT -> do
                      -- We encountered an encoding error
                      x <- peek curPtr'
                      -- Reset the DFA before consulting the handler.
                      poke statePtr 0
                      case onErr desc (Just x) of
                        Nothing -> loop $ curPtr' `plusPtr` 1
                        Just c -> do
                          destOff <- peek destOffPtr
                          w <- unsafeSTToIO $
                               unsafeWrite dest (fromIntegral destOff) (safe c)
                          poke destOffPtr (destOff + fromIntegral w)
                          loop $ curPtr' `plusPtr` 1
                    _ -> do
                      -- We encountered the end of the buffer while decoding
                      n <- peek destOffPtr
                      codepoint <- peek codepointPtr
                      chunkText <- unsafeSTToIO $ do
                          arr <- A.unsafeFreeze dest
                          return $! text arr 0 (fromIntegral n)
                      lastPtr <- peek curPtrPtr
                      let left = lastPtr `minusPtr` curPtr
                          -- Unless the DFA is back in the accept state,
                          -- keep the bytes of the incomplete code point so
                          -- the next chunk can finish it.
                          !undecoded = case state of
                            UTF8_ACCEPT -> B.empty
                            _ -> B.append undecoded0 (B.drop left bs)
                      return $ Some chunkText undecoded
                               (decodeChunk undecoded codepoint state)
            in loop (ptr `plusPtr` off)
    desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code. For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
decodeUtf8 :: ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE[0] decodeUtf8 #-}
-- The rewrite rule fuses decoding directly into a downstream stream
-- consumer, skipping the intermediate 'Text' value; the INLINE[0]
-- phase above keeps 'decodeUtf8' visible while the rule can fire.
{-# RULES "STREAM stream/decodeUtf8 fusion" [1]
    forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-}
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
decodeUtf8' :: ByteString -> Either UnicodeException Text
decodeUtf8' bs =
    unsafeDupablePerformIO (try (evaluate (decodeUtf8With strictDecode bs)))
{-# INLINE decodeUtf8' #-}
-- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding.
--
-- @since 1.1.0.0
encodeUtf8Builder :: Text -> B.Builder
encodeUtf8Builder txt =
    encodeUtf8BuilderEscaped (BP.liftFixedToBounded BP.word8) txt
-- | Encode text using UTF-8 encoding and escape the ASCII characters using
-- a 'BP.BoundedPrim'.
--
-- Use this function is to implement efficient encoders for text-based formats
-- like JSON or HTML.
--
-- @since 1.1.0.0
{-# INLINE encodeUtf8BuilderEscaped #-}
-- TODO: Extend documentation with references to source code in @blaze-html@
-- or @aeson@ that uses this function.
encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
encodeUtf8BuilderEscaped be =
    -- manual eta-expansion to ensure inlining works as expected
    \txt -> B.builder (mkBuildstep txt)
  where
    -- Worst case per input unit: 4 output bytes, or the primitive's bound.
    bound = max 4 $ BP.sizeBound be

    mkBuildstep (Text arr off len) !k =
        outerLoop off
      where
        iend = off + len

        -- Process as much input as the remaining output space allows,
        -- then either finish (k) or request a larger buffer.
        outerLoop !i0 !br@(B.BufferRange op0 ope)
          | i0 >= iend       = k br
          | outRemaining > 0 = goPartial (i0 + min outRemaining inpRemaining)
          -- TODO: Use a loop with an integrated bound's check if outRemaining
          -- is smaller than 8, as this will save on divisions.
          | otherwise        = return $ B.bufferFull bound op0 (outerLoop i0)
          where
            outRemaining = (ope `minusPtr` op0) `div` bound
            inpRemaining = iend - i0

            goPartial !iendTmp = go i0 op0
              where
                go !i !op
                  | i < iendTmp = case A.unsafeIndex arr i of
                      w | w <= 0x7F -> do
                            -- ASCII: run the caller's escaping primitive.
                            BP.runB be (fromIntegral w) op >>= go (i + 1)
                        | w <= 0x7FF -> do
                            -- Two-byte UTF-8 sequence.
                            poke8 0 $ (w `shiftR` 6) + 0xC0
                            poke8 1 $ (w .&. 0x3f) + 0x80
                            go (i + 1) (op `plusPtr` 2)
                        | 0xD800 <= w && w <= 0xDBFF -> do
                            -- Lead surrogate: combine with the following
                            -- trail unit into one code point, emitting a
                            -- four-byte sequence and consuming two units.
                            let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1))
                            poke8 0 $ (c `shiftR` 18) + 0xF0
                            poke8 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80
                            poke8 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80
                            poke8 3 $ (c .&. 0x3F) + 0x80
                            go (i + 2) (op `plusPtr` 4)
                        | otherwise -> do
                            -- Three-byte UTF-8 sequence.
                            poke8 0 $ (w `shiftR` 12) + 0xE0
                            poke8 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80
                            poke8 2 $ (w .&. 0x3F) + 0x80
                            go (i + 1) (op `plusPtr` 3)
                  | otherwise =
                      outerLoop i (B.BufferRange op ope)
                  where
                    poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8)
-- | Encode text using UTF-8 encoding.
encodeUtf8 :: Text -> ByteString
encodeUtf8 (Text arr off len)
  | len == 0 = B.empty
  | otherwise = unsafeDupablePerformIO $ do
      fp <- mallocByteString (len*3) -- see https://github.com/haskell/text/issues/194 for why len*3 is enough
      withForeignPtr fp $ \ptr ->
        with ptr $ \destPtr -> do
          c_encode_utf8 destPtr (A.aBA arr) (fromIntegral off) (fromIntegral len)
          newDest <- peek destPtr
          -- The C encoder advanced the destination pointer; the
          -- difference is the encoded length in bytes.
          let utf8len = newDest `minusPtr` ptr
          -- Keep the oversized buffer if at least half of it is used;
          -- otherwise copy into a right-sized buffer to save memory.
          if utf8len >= len `shiftR` 1
            then return (PS fp 0 utf8len)
            else do
              fp' <- mallocByteString utf8len
              withForeignPtr fp' $ \ptr' -> do
                memcpy ptr' ptr (fromIntegral utf8len)
                return (PS fp' 0 utf8len)
-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith handler bytes = F.unstream $ E.streamUtf16LE handler bytes
{-# INLINE decodeUtf16LEWith #-}

-- | Decode text from little endian UTF-16 encoding.
--
-- If the input contains any invalid little endian UTF-16 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE bytes = decodeUtf16LEWith strictDecode bytes
{-# INLINE decodeUtf16LE #-}

-- | Decode text from big endian UTF-16 encoding.
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith handler bytes = F.unstream $ E.streamUtf16BE handler bytes
{-# INLINE decodeUtf16BEWith #-}

-- | Decode text from big endian UTF-16 encoding.
--
-- If the input contains any invalid big endian UTF-16 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE bytes = decodeUtf16BEWith strictDecode bytes
{-# INLINE decodeUtf16BE #-}

-- | Encode text using little endian UTF-16 encoding.
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE t = E.unstream $ E.restreamUtf16LE $ F.stream t
{-# INLINE encodeUtf16LE #-}

-- | Encode text using big endian UTF-16 encoding.
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE t = E.unstream $ E.restreamUtf16BE $ F.stream t
{-# INLINE encodeUtf16BE #-}
-- | Decode text from little endian UTF-32 encoding.
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith handler bytes = F.unstream $ E.streamUtf32LE handler bytes
{-# INLINE decodeUtf32LEWith #-}

-- | Decode text from little endian UTF-32 encoding.
--
-- If the input contains any invalid little endian UTF-32 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE bytes = decodeUtf32LEWith strictDecode bytes
{-# INLINE decodeUtf32LE #-}

-- | Decode text from big endian UTF-32 encoding.
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith handler bytes = F.unstream $ E.streamUtf32BE handler bytes
{-# INLINE decodeUtf32BEWith #-}

-- | Decode text from big endian UTF-32 encoding.
--
-- If the input contains any invalid big endian UTF-32 data, an
-- exception will be thrown. For more control over the handling of
-- invalid data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE bytes = decodeUtf32BEWith strictDecode bytes
{-# INLINE decodeUtf32BE #-}

-- | Encode text using little endian UTF-32 encoding.
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE t = E.unstream $ E.restreamUtf32LE $ F.stream t
{-# INLINE encodeUtf32LE #-}

-- | Encode text using big endian UTF-32 encoding.
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE t = E.unstream $ E.restreamUtf32BE $ F.stream t
{-# INLINE encodeUtf32BE #-}
-- One-shot UTF-8 decoder used by the non-streaming decode path.
-- NOTE(review): exact stop/return semantics live in the C source
-- (cbits) — confirm there before relying on them.
foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
    :: MutableByteArray# s -> Ptr CSize
    -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)

-- Resumable UTF-8 decoder: the CodePoint/DecoderState pointers carry
-- the DFA state across chunk boundaries (see 'streamDecodeUtf8With',
-- which loops on the returned cursor and inspects the state).
foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state
    :: MutableByteArray# s -> Ptr CSize
    -> Ptr (Ptr Word8) -> Ptr Word8
    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)

-- Decodes a Latin-1 byte range into the destination Word16 array.
foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
    :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()

-- Encodes a Word16 array slice as UTF-8; advances the destination
-- pointer past the bytes written (consumed by 'encodeUtf8' above).
foreign import ccall unsafe "_hs_text_encode_utf8" c_encode_utf8
    :: Ptr (Ptr Word8) -> ByteArray# -> CSize -> CSize -> IO ()

View File

@ -1,124 +0,0 @@
{-# LANGUAGE CPP, DeriveDataTypeable #-}
#if __GLASGOW_HASKELL__ >= 704
{-# LANGUAGE Safe #-}
#elif __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Encoding.Error
-- Copyright : (c) Bryan O'Sullivan 2009
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : GHC
--
-- Types and functions for dealing with encoding and decoding errors
-- in Unicode text.
--
-- The standard functions for encoding and decoding text are strict,
-- which is to say that they throw exceptions on invalid input. This
-- is often unhelpful on real world input, so alternative functions
-- exist that accept custom handlers for dealing with invalid inputs.
-- These 'OnError' handlers are normal Haskell functions. You can use
-- one of the presupplied functions in this module, or you can write a
-- custom handler of your own.
module Data.Text.Encoding.Error
(
-- * Error handling types
UnicodeException(..)
, OnError
, OnDecodeError
, OnEncodeError
-- * Useful error handling functions
, lenientDecode
, strictDecode
, strictEncode
, ignore
, replace
) where
import Control.DeepSeq (NFData (..))
import Control.Exception (Exception, throw)
import Data.Typeable (Typeable)
import Data.Word (Word8)
import Numeric (showHex)
-- | Function type for handling a coding error.  It is supplied with
-- two inputs:
--
-- * A 'String' that describes the error.
--
-- * The input value that caused the error.  If the error arose
--   because the end of input was reached or could not be identified
--   precisely, this value will be 'Nothing'.
--
-- If the handler returns a value wrapped with 'Just', that value will
-- be used in the output as the replacement for the invalid input.  If
-- it returns 'Nothing', no value will be used in the output.
--
-- Should the handler need to abort processing, it should use 'error'
-- or 'throw' an exception (preferably a 'UnicodeException').  It may
-- use the description provided to construct a more helpful error
-- report.
type OnError a b = String -> Maybe a -> Maybe b

-- | A handler for a decoding error: maps an invalid input byte to an
-- optional replacement character.
type OnDecodeError = OnError Word8 Char

-- | A handler for an encoding error.
{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
type OnEncodeError = OnError Char Word8
-- | An exception type for representing Unicode encoding errors.
data UnicodeException =
    DecodeError String (Maybe Word8)
    -- ^ Could not decode a byte sequence because it was invalid under
    -- the given encoding, or ran out of input in mid-decode.
  | EncodeError String (Maybe Char)
    -- ^ Tried to encode a character that could not be represented
    -- under the given encoding, or ran out of input in mid-encode.
    deriving (Eq, Typeable)

{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-}
-- Render a 'UnicodeException' as a human-readable message; offending
-- bytes/characters are shown as hexadecimal escapes.
showUnicodeException :: UnicodeException -> String
showUnicodeException err = case err of
    DecodeError desc (Just w) ->
        "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
    DecodeError desc Nothing ->
        "Cannot decode input: " ++ desc
    EncodeError desc (Just c) ->
        "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
    EncodeError desc Nothing ->
        "Cannot encode input: " ++ desc
instance Show UnicodeException where
    show = showUnicodeException

instance Exception UnicodeException

-- Fully evaluate both the description and the offending input value.
instance NFData UnicodeException where
    rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` ()
    rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` ()
-- | Throw a 'UnicodeException' if decoding fails.
strictDecode :: OnDecodeError
strictDecode desc mb = throw (DecodeError desc mb)

-- | Replace an invalid input byte with the Unicode replacement
-- character U+FFFD.
lenientDecode :: OnDecodeError
lenientDecode = replace '\xfffd'

-- | Throw a 'UnicodeException' if encoding fails.
{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-}
strictEncode :: OnEncodeError
strictEncode desc mc = throw (EncodeError desc mc)

-- | Ignore an invalid input, substituting nothing in the output.
ignore :: OnError a b
ignore _desc _input = Nothing

-- | Replace an invalid input with a valid output.
replace :: b -> OnError a b
replace rep _desc _input = Just rep

View File

@ -1,176 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving #-}
-- |
-- Module : Data.Text.Foreign
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : GHC
--
-- Support for using 'Text' data with native code via the Haskell
-- foreign function interface.
module Data.Text.Foreign
(
-- * Interoperability with native code
-- $interop
I16
-- * Safe conversion functions
, fromPtr
, useAsPtr
, asForeignPtr
-- ** Encoding as UTF-8
, peekCStringLen
, withCStringLen
-- * Unsafe conversion code
, lengthWord16
, unsafeCopyToPtr
-- * Low-level manipulation
-- $lowlevel
, dropWord16
, takeWord16
) where
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
#if MIN_VERSION_base(4,4,0)
import Control.Monad.ST.Unsafe (unsafeIOToST)
#else
import Control.Monad.ST (unsafeIOToST)
#endif
import Data.ByteString.Unsafe (unsafePackCStringLen, unsafeUseAsCStringLen)
import Data.Text.Encoding (decodeUtf8, encodeUtf8)
import Data.Text.Internal (Text(..), empty)
import Data.Text.Unsafe (lengthWord16)
import Data.Word (Word16)
import Foreign.C.String (CStringLen)
import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray, withForeignPtr)
import Foreign.Marshal.Alloc (allocaBytes)
import Foreign.Ptr (Ptr, castPtr, plusPtr)
import Foreign.Storable (peek, poke)
import qualified Data.Text.Array as A
-- $interop
--
-- The 'Text' type is implemented using arrays that are not guaranteed
-- to have a fixed address in the Haskell heap. All communication with
-- native code must thus occur by copying data back and forth.
--
-- The 'Text' type's internal representation is UTF-16, using the
-- platform's native endianness. This makes copied data suitable for
-- use with native libraries that use a similar representation, such
-- as ICU. To interoperate with native libraries that use different
-- internal representations, such as UTF-8 or UTF-32, consider using
-- the functions in the 'Data.Text.Encoding' module.
-- | A type representing a number of UTF-16 code units.
--
-- This counts code units, not characters: a supplementary-plane
-- character occupies two units.
newtype I16 = I16 Int
    deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show)
-- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word16' by copying the
-- contents of the array.
--
-- No validation is performed on the copied units; the source is
-- expected to hold well-formed UTF-16 in native endianness.
fromPtr :: Ptr Word16           -- ^ source array
        -> I16                  -- ^ length of source array (in 'Word16' units)
        -> IO Text
fromPtr _ (I16 0) = return empty
fromPtr ptr (I16 len) =
#if defined(ASSERTS)
    assert (len > 0) $
#endif
    return $! Text arr 0 len
  where
    arr = A.run (A.new len >>= copy)
    copy marr = loop ptr 0
      where
        loop !p !i | i == len = return marr
                   | otherwise = do
                       A.unsafeWrite marr i =<< unsafeIOToST (peek p)
                       -- Word16 elements: advance the pointer 2 bytes per step.
                       loop (p `plusPtr` 2) (i + 1)
-- $lowlevel
--
-- Foreign functions that use UTF-16 internally may return indices in
-- units of 'Word16' instead of characters. These functions may
-- safely be used with such indices, as they will adjust offsets if
-- necessary to preserve the validity of a Unicode string.
-- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word16' units in
-- length.
--
-- If @n@ would cause the 'Text' to end inside a surrogate pair, the
-- end of the prefix will be advanced by one additional 'Word16' unit
-- to maintain its validity.
takeWord16 :: I16 -> Text -> Text
takeWord16 (I16 n) t@(Text arr off len)
    | n <= 0               = empty
    | n >= len || m >= len = t
    | otherwise            = Text arr off m
  where
    -- If the last unit kept (w) is a lead surrogate (0xD800-0xDBFF),
    -- keep one extra unit so its trail surrogate is not cut off.
    m | w < 0xD800 || w > 0xDBFF = n
      | otherwise                = n+1
    w = A.unsafeIndex arr (off+n-1)
-- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word16' units
-- dropped from its beginning.
--
-- If @n@ would cause the 'Text' to begin inside a surrogate pair, the
-- beginning of the suffix will be advanced by one additional 'Word16'
-- unit to maintain its validity.
dropWord16 :: I16 -> Text -> Text
dropWord16 (I16 n) t@(Text arr off len)
    | n <= 0               = t
    | n >= len || m >= len = empty
    | otherwise            = Text arr (off+m) (len-m)
  where
    -- If the last unit dropped (w) is a lead surrogate, also drop its
    -- trail so the suffix starts on a code point boundary.
    m | w < 0xD800 || w > 0xDBFF = n
      | otherwise                = n+1
    w = A.unsafeIndex arr (off+n-1)
-- | /O(n)/ Copy a 'Text' to an array.  The array is assumed to be big
-- enough to hold the contents of the entire 'Text'.
unsafeCopyToPtr :: Text -> Ptr Word16 -> IO ()
unsafeCopyToPtr (Text arr off len) dst = go dst off
  where
    limit = off + len
    go !p !i
      | i == limit = return ()
      | otherwise  = do
          poke p (A.unsafeIndex arr i)
          -- Each element is 2 bytes wide.
          go (p `plusPtr` 2) (i + 1)
-- | /O(n)/ Perform an action on a temporary, mutable copy of a
-- 'Text'.  The copy is freed as soon as the action returns.
useAsPtr :: Text -> (Ptr Word16 -> I16 -> IO a) -> IO a
useAsPtr t@(Text _arr _off len) action = allocaBytes byteLen run
  where
    byteLen = len * 2  -- two bytes per Word16 unit
    run buf = do
      unsafeCopyToPtr t buf
      action (castPtr buf) (fromIntegral len)
-- | /O(n)/ Make a mutable copy of a 'Text'.
asForeignPtr :: Text -> IO (ForeignPtr Word16, I16)
asForeignPtr t@(Text _arr _off len) = do
    fp <- mallocForeignPtrArray len
    withForeignPtr fp (unsafeCopyToPtr t)
    return (fp, I16 len)
-- | /O(n)/ Decode a C string with explicit length, which is assumed
-- to have been encoded as UTF-8.  If decoding fails, a
-- 'UnicodeException' is thrown.
--
-- @since 1.0.0.0
peekCStringLen :: CStringLen -> IO Text
peekCStringLen cs = do
    bytes <- unsafePackCStringLen cs
    -- Force the decode here so a UnicodeException surfaces in IO.
    let !t = decodeUtf8 bytes
    return t
-- | Marshal a 'Text' into a C string encoded as UTF-8 in temporary
-- storage, with explicit length information.  The encoded string may
-- contain NUL bytes, and is not followed by a trailing NUL byte.
--
-- The temporary storage is freed when the subcomputation terminates
-- (either normally or via an exception), so the pointer to the
-- temporary storage must /not/ be used after this function returns.
--
-- @since 1.0.0.0
withCStringLen :: Text -> (CStringLen -> IO a) -> IO a
withCStringLen t = unsafeUseAsCStringLen (encodeUtf8 t)

View File

@ -1,350 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, RecordWildCards, ScopedTypeVariables #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.IO
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Simon Marlow
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : GHC
--
-- Efficient locale-sensitive support for text I\/O.
--
-- Skip past the synopsis for some important notes on performance and
-- portability across different versions of GHC.
module Data.Text.IO
(
-- * Performance
-- $performance
-- * Locale support
-- $locale
-- * File-at-a-time operations
readFile
, writeFile
, appendFile
-- * Operations on handles
, hGetContents
, hGetChunk
, hGetLine
, hPutStr
, hPutStrLn
-- * Special cases for standard input and output
, interact
, getContents
, getLine
, putStr
, putStrLn
) where
import Data.Text (Text)
import Prelude hiding (appendFile, getContents, getLine, interact,
putStr, putStrLn, readFile, writeFile)
import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout,
withFile)
import qualified Control.Exception as E
import Control.Monad (liftM2, when)
import Data.IORef (readIORef, writeIORef)
import qualified Data.Text as T
import Data.Text.Internal.Fusion (stream)
import Data.Text.Internal.Fusion.Types (Step(..), Stream(..))
import Data.Text.Internal.IO (hGetLineWith, readChunk)
import GHC.IO.Buffer (Buffer(..), BufferState(..), CharBufElem, CharBuffer,
RawCharBuffer, emptyBuffer, isEmptyBuffer, newCharBuffer,
writeCharBuf)
import GHC.IO.Exception (IOException(ioe_type), IOErrorType(InappropriateType))
import GHC.IO.Handle.Internals (augmentIOError, hClose_help, wantReadableHandle,
wantWritableHandle)
import GHC.IO.Handle.Text (commitBuffer')
import GHC.IO.Handle.Types (BufferList(..), BufferMode(..), Handle__(..),
HandleType(..), Newline(..))
import System.IO (hGetBuffering, hFileSize, hSetBuffering, hTell)
import System.IO.Error (isEOFError)
-- $performance
-- #performance#
--
-- The functions in this module obey the runtime system's locale,
-- character set encoding, and line ending conversion settings.
--
-- If you know in advance that you will be working with data that has
-- a specific encoding (e.g. UTF-8), and your application is highly
-- performance sensitive, you may find that it is faster to perform
-- I\/O with bytestrings and to encode and decode yourself than to use
-- the functions in this module.
--
-- Whether this will hold depends on the version of GHC you are using,
-- the platform you are working on, the data you are working with, and
-- the encodings you are using, so be sure to test for yourself.
-- | The 'readFile' function reads a file and returns the contents of
-- the file as a string.  The entire file is read strictly, as with
-- 'getContents'.
readFile :: FilePath -> IO Text
readFile name = hGetContents =<< openFile name ReadMode

-- | Write a string to a file.  The file is truncated to zero length
-- before writing begins.
writeFile :: FilePath -> Text -> IO ()
writeFile p t = withFile p WriteMode (\h -> hPutStr h t)

-- | Write a string the end of a file.
appendFile :: FilePath -> Text -> IO ()
appendFile p t = withFile p AppendMode (\h -> hPutStr h t)
-- Shared error handler for the read paths: treat EOF as end of
-- stream (returning any '\r' left stranded in the character buffer),
-- and re-throw anything else, tagged with the caller's name.
catchError :: String -> Handle -> Handle__ -> IOError -> IO (Text, Bool)
catchError caller h Handle__{..} err
    | isEOFError err = do
        buf <- readIORef haCharBuffer
        return $ if isEmptyBuffer buf
                 then (T.empty, True)
                 else (T.singleton '\r', True)
    | otherwise = E.throwIO (augmentIOError err caller h)
-- | Wrap readChunk and return a value indicating if we're reached the EOF.
-- This is needed because unpack_nl is unable to discern the difference
-- between a buffer with just \r due to EOF or because not enough data was left
-- for decoding. e.g. the final character decoded from the byte buffer was \r.
readChunkEof :: Handle__ -> CharBuffer -> IO (Text, Bool)
readChunkEof hh buf = fmap (\t -> (t, False)) (readChunk hh buf)
-- | /Experimental./ Read a single chunk of strict text from a
-- 'Handle'. The size of the chunk depends on the amount of input
-- currently buffered.
--
-- This function blocks only if there is no data available, and EOF
-- has not yet been reached. Once EOF is reached, this function
-- returns an empty string instead of throwing an exception.
hGetChunk :: Handle -> IO Text
hGetChunk h = wantReadableHandle "hGetChunk" h readSingleChunk
  where
    readSingleChunk hh@Handle__{..} = do
      buf <- readIORef haCharBuffer
      -- catchError turns EOF into an empty (or "\r") chunk rather
      -- than an exception.
      (t, _) <- readChunkEof hh buf `E.catch` catchError "hGetChunk" h hh
      return (hh, t)
-- | Read the remaining contents of a 'Handle' as a string.  The
-- 'Handle' is closed once the contents have been read, or if an
-- exception is thrown.
--
-- Internally, this function reads a chunk at a time from the
-- lower-level buffering abstraction, and concatenates the chunks into
-- a single string once the entire file has been read.
--
-- As a result, it requires approximately twice as much memory as its
-- result to construct its result.  For files more than a half of
-- available RAM in size, this may result in memory exhaustion.
hGetContents :: Handle -> IO Text
hGetContents h = do
    chooseGoodBuffering h
    wantReadableHandle "hGetContents" h readAll
  where
    readAll hh@Handle__{..} = do
      -- Accumulate chunks until readChunkEof (via catchError) reports EOF.
      let readChunks = do
            buf <- readIORef haCharBuffer
            (t, eof) <- readChunkEof hh buf
                          `E.catch` catchError "hGetContents" h hh
            if eof
              then return [t]
              else (t:) `fmap` readChunks
      ts <- readChunks
      -- Close the underlying handle and mark it closed before returning.
      (hh', _) <- hClose_help hh
      return (hh'{haType=ClosedHandle}, T.concat ts)
-- | Use a more efficient buffer size if we're reading in
-- block-buffered mode with the default buffer size.  When we can
-- determine the size of the handle we're reading, set the buffer size
-- to that, so that we can read the entire file in one chunk.
-- Otherwise, use a buffer size of at least 16KB.
chooseGoodBuffering :: Handle -> IO ()
chooseGoodBuffering h = do
    bufMode <- hGetBuffering h
    case bufMode of
      BlockBuffering Nothing -> do
        -- Remaining bytes = file size minus current position.  Handles
        -- with no size (e.g. when hFileSize raises InappropriateType)
        -- fall back to a fixed 16KB buffer.
        d <- E.catch (liftM2 (-) (hFileSize h) (hTell h)) $ \(e::IOException) ->
               if ioe_type e == InappropriateType
               then return 16384 -- faster than the 2KB default
               else E.throwIO e
        when (d > 0) . hSetBuffering h . BlockBuffering . Just . fromIntegral $ d
      _ -> return ()
-- | Read a single line from a handle.
hGetLine :: Handle -> IO Text
hGetLine h = hGetLineWith T.concat h
-- | Write a string to a handle.
hPutStr :: Handle -> Text -> IO ()
-- This function is lifted almost verbatim from GHC.IO.Handle.Text.
hPutStr h t = do
    (buffer_mode, nl) <-
      wantWritableHandle "hPutStr" h $ \h_ -> do
        bmode <- getSpareBuffer h_
        return (bmode, haOutputNL h_)
    let str = stream t
    -- Dispatch on the handle's buffering mode; CRLF newline
    -- translation gets a dedicated block-buffered path.
    case buffer_mode of
      (NoBuffering, _) -> hPutChars h str
      (LineBuffering, buf) -> writeLines h nl buf str
      (BlockBuffering _, buf)
        | nl == CRLF -> writeBlocksCRLF h buf str
        | otherwise  -> writeBlocksRaw h buf str
-- Unbuffered output: emit the stream's characters one at a time.
hPutChars :: Handle -> Stream Char -> IO ()
hPutChars h (Stream step s0 _len) = go s0
  where
    go !st = case step st of
      Done        -> return ()
      Skip st'    -> go st'
      Yield c st' -> hPutChar h c >> go st'
-- The following functions are largely lifted from GHC.IO.Handle.Text,
-- but adapted to a coinductive stream of data instead of an inductive
-- list.
--
-- We have several variations of more or less the same code for
-- performance reasons.  Splitting the original buffered write
-- function into line- and block-oriented versions gave us a 2.1x
-- performance improvement.  Lifting out the raw/cooked newline
-- handling gave a few more percent on top.

-- Line-buffered write: flush the buffer at each '\n' (translated to
-- CRLF when the handle's newline mode requires it) or when full.
writeLines :: Handle -> Newline -> Buffer CharBufElem -> Stream Char -> IO ()
writeLines h nl buf0 (Stream next0 s0 _len) = outer s0 buf0
  where
    outer s1 Buffer{bufRaw=raw, bufSize=len} = inner s1 (0::Int)
      where
        inner !s !n =
          case next0 s of
            Done -> commit n False{-no flush-} True{-release-} >> return ()
            Skip s' -> inner s' n
            Yield x s'
              | n + 1 >= len -> commit n True{-needs flush-} False >>= outer s
              | x == '\n' -> do
                  n' <- if nl == CRLF
                        then do n1 <- writeCharBuf raw n '\r'
                                writeCharBuf raw n1 '\n'
                        else writeCharBuf raw n x
                  commit n' True{-needs flush-} False >>= outer s'
              | otherwise -> writeCharBuf raw n x >>= inner s'
        commit = commitBuffer h raw len
-- Block-buffered write with CRLF translation: flush only when the
-- buffer fills, expanding each '\n' to "\r\n".
writeBlocksCRLF :: Handle -> Buffer CharBufElem -> Stream Char -> IO ()
writeBlocksCRLF h buf0 (Stream next0 s0 _len) = outer s0 buf0
  where
    outer s1 Buffer{bufRaw=raw, bufSize=len} = inner s1 (0::Int)
      where
        inner !s !n =
          case next0 s of
            Done -> commit n False{-no flush-} True{-release-} >> return ()
            Skip s' -> inner s' n
            Yield x s'
              | n + 1 >= len -> commit n True{-needs flush-} False >>= outer s
              | x == '\n' -> do n1 <- writeCharBuf raw n '\r'
                                writeCharBuf raw n1 '\n' >>= inner s'
              | otherwise -> writeCharBuf raw n x >>= inner s'
        commit = commitBuffer h raw len
-- Block-buffered write without newline translation: flush only when
-- the buffer fills.
writeBlocksRaw :: Handle -> Buffer CharBufElem -> Stream Char -> IO ()
writeBlocksRaw h buf0 (Stream next0 s0 _len) = outer s0 buf0
  where
    outer s1 Buffer{bufRaw=raw, bufSize=len} = inner s1 (0::Int)
      where
        inner !s !n =
          case next0 s of
            Done -> commit n False{-no flush-} True{-release-} >> return ()
            Skip s' -> inner s' n
            Yield x s'
              | n + 1 >= len -> commit n True{-needs flush-} False >>= outer s
              | otherwise -> writeCharBuf raw n x >>= inner s'
        commit = commitBuffer h raw len
-- This function is completely lifted from GHC.IO.Handle.Text.
-- Fetch a write buffer: reuse a spare buffer from the handle's free
-- list if one exists, otherwise allocate a fresh one of the same size
-- as the current character buffer.
getSpareBuffer :: Handle__ -> IO (BufferMode, CharBuffer)
getSpareBuffer Handle__{haCharBuffer=ref,
                        haBuffers=spare_ref,
                        haBufferMode=mode}
  = do
    case mode of
      NoBuffering -> return (mode, error "no buffer!")
      _ -> do
        bufs <- readIORef spare_ref
        buf  <- readIORef ref
        case bufs of
          BufferListCons b rest -> do
            -- Pop the spare off the free list and reset it for writing.
            writeIORef spare_ref rest
            return ( mode, emptyBuffer b (bufSize buf) WriteBuffer)
          BufferListNil -> do
            new_buf <- newCharBuffer (bufSize buf) WriteBuffer
            return (mode, new_buf)
-- This function is completely lifted from GHC.IO.Handle.Text.
-- Push @count@ characters of @raw@ through the handle, optionally
-- flushing and/or releasing the buffer, and return the next buffer.
commitBuffer :: Handle -> RawCharBuffer -> Int -> Int -> Bool -> Bool
             -> IO CharBuffer
commitBuffer hdl !raw !sz !count flush release =
  wantWritableHandle "commitAndReleaseBuffer" hdl $
    commitBuffer' raw sz count flush release
{-# INLINE commitBuffer #-}
-- | Write a string to a handle, followed by a newline.
hPutStrLn :: Handle -> Text -> IO ()
hPutStrLn h t = do
    hPutStr h t
    hPutChar h '\n'

-- | The 'interact' function takes a function of type @Text -> Text@
-- as its argument.  The entire input from the standard input device is
-- passed to this function as its argument, and the resulting string
-- is output on the standard output device.
interact :: (Text -> Text) -> IO ()
interact f = getContents >>= putStr . f

-- | Read all user input on 'stdin' as a single string.
getContents :: IO Text
getContents = hGetContents stdin

-- | Read a single line of user input from 'stdin'.
getLine :: IO Text
getLine = hGetLine stdin

-- | Write a string to 'stdout'.
putStr :: Text -> IO ()
putStr = hPutStr stdout

-- | Write a string to 'stdout', followed by a newline.
putStrLn :: Text -> IO ()
putStrLn = hPutStrLn stdout
-- $locale
--
-- /Note/: The behaviour of functions in this module depends on the
-- version of GHC you are using.
--
-- Beginning with GHC 6.12, text I\/O is performed using the system or
-- handle's current locale and line ending conventions.
--
-- Under GHC 6.10 and earlier, the system I\/O libraries do not
-- support locale-sensitive I\/O or line ending conversion. On these
-- versions of GHC, functions in this library all use UTF-8. What
-- does this mean in practice?
--
-- * All data that is read will be decoded as UTF-8.
--
-- * Before data is written, it is first encoded as UTF-8.
--
-- * On both reading and writing, the platform's native newline
-- conversion is performed.
--
-- If you must use a non-UTF-8 locale on an older version of GHC, you
-- will have to perform the transcoding yourself, e.g. as follows:
--
-- > import qualified Data.ByteString as B
-- > import Data.Text (Text)
-- > import Data.Text.Encoding (encodeUtf16)
-- >
-- > putStr_Utf16LE :: Text -> IO ()
-- > putStr_Utf16LE t = B.putStr (encodeUtf16LE t)
--
-- On transcoding errors, an 'IOError' exception is thrown. You can
-- use the API in "Data.Text.Encoding" if you need more control over
-- error handling or transcoding.

View File

@ -1,188 +0,0 @@
{-# LANGUAGE CPP, DeriveDataTypeable, UnboxedTuples #-}
{-# OPTIONS_HADDOCK not-home #-}
-- |
-- Module : Data.Text.Internal
-- Copyright : (c) 2008, 2009 Tom Harper,
-- (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- A module containing private 'Text' internals. This exposes the
-- 'Text' representation and low level construction functions.
-- Modules which extend the 'Text' system may need to use this module.
--
-- You should not use this module unless you are determined to monkey
-- with the internals, as the functions here do just about nothing to
-- preserve data invariants. You have been warned!
#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__)
#include "MachDeps.h"
#endif
module Data.Text.Internal
(
-- * Types
-- $internals
Text(..)
-- * Construction
, text
, textP
-- * Safety
, safe
-- * Code that must be here for accessibility
, empty
, empty_
-- * Utilities
, firstf
-- * Checked multiplication
, mul
, mul32
, mul64
-- * Debugging
, showText
) where
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.Bits
import Data.Int (Int32, Int64)
import Data.Text.Internal.Unsafe.Char (ord)
import Data.Typeable (Typeable)
import qualified Data.Text.Array as A
-- | A space efficient, packed, unboxed Unicode text type.
--
-- A value is a slice (offset + length) of a shared 'A.Array' of
-- UTF-16 code units.
data Text = Text
    {-# UNPACK #-} !A.Array -- payload (Word16 elements)
    {-# UNPACK #-} !Int     -- offset (units of Word16, not Char)
    {-# UNPACK #-} !Int     -- length (units of Word16, not Char)
    deriving (Typeable)
-- | Smart constructor.  When built with @ASSERTS@, checks that the
-- offset and length describe a valid slice and that a non-empty slice
-- does not begin on a trailing surrogate (0xDC00-0xDFFF).
text_ :: A.Array -> Int -> Int -> Text
text_ arr off len =
#if defined(ASSERTS)
  let c    = A.unsafeIndex arr off
      alen = A.length arr
  in assert (len >= 0) .
     assert (off >= 0) .
     assert (alen == 0 || len == 0 || off < alen) .
     assert (len == 0 || c < 0xDC00 || c > 0xDFFF) $
#endif
    Text arr off len
{-# INLINE text_ #-}
-- | /O(1)/ The empty 'Text'.
empty :: Text
empty = Text A.empty 0 0
{-# INLINE [1] empty #-}
-- | A non-inlined version of 'empty'.
empty_ :: Text
empty_ = Text A.empty 0 0
{-# NOINLINE empty_ #-}
-- | Construct a 'Text' without invisibly pinning its byte array in
-- memory if its length has dwindled to zero.
text :: A.Array -> Int -> Int -> Text
text arr off len | len == 0 = empty
| otherwise = text_ arr off len
{-# INLINE text #-}
textP :: A.Array -> Int -> Int -> Text
{-# DEPRECATED textP "Use text instead" #-}
textP = text
-- | A useful 'show'-like function for debugging purposes.
showText :: Text -> String
showText (Text arr off len) =
"Text " ++ show (A.toList arr off len) ++ ' ' :
show off ++ ' ' : show len
-- | Map a 'Char' to a 'Text'-safe value.
--
-- UTF-16 surrogate code points are not included in the set of Unicode
-- scalar values, but are unfortunately admitted as valid 'Char'
-- values by Haskell. They cannot be represented in a 'Text'. This
-- function remaps those code points to the Unicode replacement
-- character (U+FFFD, \'&#xfffd;\'), and leaves other code points
-- unchanged.
safe :: Char -> Char
safe c
| ord c .&. 0x1ff800 /= 0xd800 = c
| otherwise = '\xfffd'
{-# INLINE [0] safe #-}
-- | Apply a function to the first element of an optional pair.
firstf :: (a -> c) -> Maybe (a,b) -> Maybe (c,b)
firstf f (Just (a, b)) = Just (f a, b)
firstf _ Nothing = Nothing
-- | Checked multiplication. Calls 'error' if the result would
-- overflow.
mul :: Int -> Int -> Int
#if WORD_SIZE_IN_BITS == 64
mul a b = fromIntegral $ fromIntegral a `mul64` fromIntegral b
#else
mul a b = fromIntegral $ fromIntegral a `mul32` fromIntegral b
#endif
{-# INLINE mul #-}
infixl 7 `mul`
-- | Checked multiplication. Calls 'error' if the result would
-- overflow.
mul64 :: Int64 -> Int64 -> Int64
mul64 a b
| a >= 0 && b >= 0 = mul64_ a b
| a >= 0 = -mul64_ a (-b)
| b >= 0 = -mul64_ (-a) b
| otherwise = mul64_ (-a) (-b)
{-# INLINE mul64 #-}
infixl 7 `mul64`
mul64_ :: Int64 -> Int64 -> Int64
mul64_ a b
| ahi > 0 && bhi > 0 = error "overflow"
| top > 0x7fffffff = error "overflow"
| total < 0 = error "overflow"
| otherwise = total
where (# ahi, alo #) = (# a `shiftR` 32, a .&. 0xffffffff #)
(# bhi, blo #) = (# b `shiftR` 32, b .&. 0xffffffff #)
top = ahi * blo + alo * bhi
total = (top `shiftL` 32) + alo * blo
{-# INLINE mul64_ #-}
-- | Checked multiplication. Calls 'error' if the result would
-- overflow.
mul32 :: Int32 -> Int32 -> Int32
mul32 a b = case fromIntegral a * fromIntegral b of
ab | ab < min32 || ab > max32 -> error "overflow"
| otherwise -> fromIntegral ab
where min32 = -0x80000000 :: Int64
max32 = 0x7fffffff
{-# INLINE mul32 #-}
infixl 7 `mul32`
-- $internals
--
-- Internally, the 'Text' type is represented as an array of 'Word16'
-- UTF-16 code units. The offset and length fields in the constructor
-- are in these units, /not/ units of 'Char'.
--
-- Invariants that all functions must maintain:
--
-- * Since the 'Text' type uses UTF-16 internally, it cannot represent
-- characters in the reserved surrogate code point range U+D800 to
-- U+DFFF. To maintain this invariant, the 'safe' function maps
-- 'Char' values in this range to the replacement character (U+FFFD,
-- \'&#xfffd;\').
--
-- * A leading (or \"high\") surrogate code unit (0xD8000xDBFF) must
-- always be followed by a trailing (or \"low\") surrogate code unit
-- (0xDC00-0xDFFF). A trailing surrogate code unit must always be
-- preceded by a leading surrogate code unit.

View File

@ -1,329 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
{-# OPTIONS_HADDOCK not-home #-}
-----------------------------------------------------------------------------
-- |
-- Module : Data.Text.Internal.Builder
-- Copyright : (c) 2013 Bryan O'Sullivan
-- (c) 2010 Johan Tibell
-- License : BSD-style (see LICENSE)
--
-- Maintainer : Johan Tibell <johan.tibell@gmail.com>
-- Stability : experimental
-- Portability : portable to Hugs and GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Efficient construction of lazy @Text@ values. The principal
-- operations on a @Builder@ are @singleton@, @fromText@, and
-- @fromLazyText@, which construct new builders, and 'mappend', which
-- concatenates two builders.
--
-- To get maximum performance when building lazy @Text@ values using a
-- builder, associate @mappend@ calls to the right. For example,
-- prefer
--
-- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c')
--
-- to
--
-- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c'
--
-- as the latter associates @mappend@ to the left.
--
-----------------------------------------------------------------------------
module Data.Text.Internal.Builder
   ( -- * Public API
     -- ** The Builder type
     Builder
   , toLazyText
   , toLazyTextWith
     -- ** Constructing Builders
   , singleton
   , fromText
   , fromLazyText
   , fromString
     -- ** Flushing the buffer state
   , flush
     -- * Internal functions
   , append'
   , ensureFree
   , writeN
   ) where
import Control.Monad.ST (ST, runST)
import Data.Monoid (Monoid(..))
#if !MIN_VERSION_base(4,11,0) && MIN_VERSION_base(4,9,0)
import Data.Semigroup (Semigroup(..))
#endif
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Lazy (smallChunkSize)
import Data.Text.Unsafe (inlineInterleaveST)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Prelude hiding (map, putChar)
import qualified Data.String as String
import qualified Data.Text as S
import qualified Data.Text.Array as A
import qualified Data.Text.Lazy as L
------------------------------------------------------------------------
-- | A @Builder@ is an efficient way to build lazy @Text@ values.
-- There are several functions for constructing builders, but only one
-- to inspect them: to extract any data, you have to turn them into
-- lazy @Text@ values using @toLazyText@.
--
-- Internally, a builder constructs a lazy @Text@ by filling arrays
-- piece by piece. As each buffer is filled, it is \'popped\' off, to
-- become a new chunk of the resulting lazy @Text@. All this is
-- hidden from the user of the @Builder@.
--
-- The representation is continuation-passing style: a builder is a
-- function from "what to do with the rest of the buffer" to the list
-- of finished chunks, which makes 'append' plain function
-- composition.
newtype Builder = Builder {
     -- Invariant (from Data.Text.Lazy):
     -- The lists include no null Texts.
     runBuilder :: forall s. (Buffer s -> ST s [S.Text])
                -> Buffer s
                -> ST s [S.Text]
   }
#if MIN_VERSION_base(4,9,0)
instance Semigroup Builder where
   (<>) = append
   {-# INLINE (<>) #-}
#endif
instance Monoid Builder where
   mempty = empty
   {-# INLINE mempty #-}
#if MIN_VERSION_base(4,9,0)
   mappend = (<>) -- future-proof definition
#else
   mappend = append
#endif
   {-# INLINE mappend #-}
   mconcat = foldr mappend Data.Monoid.mempty
   {-# INLINE mconcat #-}
instance String.IsString Builder where
    -- The right-hand side is this module's top-level 'fromString'
    -- (Data.String is imported qualified), not the class method, so
    -- this is not a self-referential loop.
    fromString = fromString
    {-# INLINE fromString #-}
instance Show Builder where
    show = show . toLazyText
instance Eq Builder where
    a == b = toLazyText a == toLazyText b
instance Ord Builder where
    a <= b = toLazyText a <= toLazyText b
------------------------------------------------------------------------
-- | /O(1)./ The empty @Builder@, satisfying
--
-- * @'toLazyText' 'empty' = 'L.empty'@
--
empty :: Builder
empty = Builder (\ k buf -> k buf)
{-# INLINE empty #-}
-- | /O(1)./ A @Builder@ taking a single character, satisfying
--
-- * @'toLazyText' ('singleton' c) = 'L.singleton' c@
--
singleton :: Char -> Builder
-- 2 is the worst case: a character outside the BMP needs a surrogate
-- pair, i.e. two Word16 units.
singleton c = writeAtMost 2 $ \ marr o -> unsafeWrite marr o c
{-# INLINE singleton #-}
------------------------------------------------------------------------
-- | /O(1)./ The concatenation of two builders, an associative
-- operation with identity 'empty', satisfying
--
-- * @'toLazyText' ('append' x y) = 'L.append' ('toLazyText' x) ('toLazyText' y)@
--
append :: Builder -> Builder -> Builder
append (Builder f) (Builder g) = Builder (f . g)
{-# INLINE [0] append #-}
-- TODO: Experiment to find the right threshold.
copyLimit :: Int
copyLimit = 128
-- This function attempts to merge small @Text@ values instead of
-- treating each value as its own chunk. We may not always want this.
-- | /O(1)./ A @Builder@ taking a 'S.Text', satisfying
--
-- * @'toLazyText' ('fromText' t) = 'L.fromChunks' [t]@
--
fromText :: S.Text -> Builder
fromText t@(Text arr off l)
    | S.null t = empty
    -- Small texts are copied into the current buffer; larger ones
    -- become their own chunk (after flushing what we have so far).
    | l <= copyLimit = writeN l $ \marr o -> A.copyI marr o arr off (l+o)
    | otherwise = flush `append` mapBuilder (t :)
{-# INLINE [1] fromText #-}
{-# RULES
"fromText/pack" forall s .
        fromText (S.pack s) = fromString s
 #-}
-- | /O(1)./ A Builder taking a @String@, satisfying
--
-- * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@
--
fromString :: String -> Builder
fromString str = Builder $ \k (Buffer p0 o0 u0 l0) ->
    let loop !marr !o !u !l [] = k (Buffer marr o u l)
        loop marr o u l s@(c:cs)
            -- l <= 1: not enough room left even for a BMP character;
            -- freeze the current buffer as a chunk and start a new one.
            | l <= 1 = do
                arr <- A.unsafeFreeze marr
                let !t = Text arr o u
                marr' <- A.new chunkSize
                ts <- inlineInterleaveST (loop marr' 0 0 chunkSize s)
                return $ t : ts
            | otherwise = do
                n <- unsafeWrite marr (o+u) c
                loop marr o (u+n) (l-n) cs
    in loop p0 o0 u0 l0 str
  where
    chunkSize = smallChunkSize
{-# INLINE fromString #-}
-- | /O(1)./ A @Builder@ taking a lazy @Text@, satisfying
--
-- * @'toLazyText' ('fromLazyText' t) = t@
--
fromLazyText :: L.Text -> Builder
fromLazyText ts = flush `append` mapBuilder (L.toChunks ts ++)
{-# INLINE fromLazyText #-}
------------------------------------------------------------------------
-- Our internal buffer type: a mutable array plus the region of it in
-- use (offset + used units) and the free space remaining.
data Buffer s = Buffer {-# UNPACK #-} !(A.MArray s)
                       {-# UNPACK #-} !Int -- offset
                       {-# UNPACK #-} !Int -- used units
                       {-# UNPACK #-} !Int -- length left
------------------------------------------------------------------------
-- | /O(n)./ Extract a lazy @Text@ from a @Builder@ with a default
-- buffer size. The construction work takes place if and when the
-- relevant part of the lazy @Text@ is demanded.
toLazyText :: Builder -> L.Text
toLazyText = toLazyTextWith smallChunkSize
-- | /O(n)./ Extract a lazy @Text@ from a @Builder@, using the given
-- size for the initial buffer. The construction work takes place if
-- and when the relevant part of the lazy @Text@ is demanded.
--
-- If the initial buffer is too small to hold all data, subsequent
-- buffers will be the default buffer size.
toLazyTextWith :: Int -> Builder -> L.Text
toLazyTextWith chunkSize m = L.fromChunks (runST $
  newBuffer chunkSize >>= runBuilder (m `append` flush) (const (return [])))
-- | /O(1)./ Pop the strict @Text@ we have constructed so far, if any,
-- yielding a new chunk in the result lazy @Text@.
flush :: Builder
flush = Builder $ \ k buf@(Buffer p o u l) ->
    if u == 0
    then k buf
    else do arr <- A.unsafeFreeze p
            let !b = Buffer p (o+u) 0 l
                !t = Text arr o u
            ts <- inlineInterleaveST (k b)
            return $! t : ts
{-# INLINE [1] flush #-}
-- defer inlining so that flush/flush rule may fire.
------------------------------------------------------------------------
-- | Sequence an ST operation on the buffer
withBuffer :: (forall s. Buffer s -> ST s (Buffer s)) -> Builder
withBuffer f = Builder $ \k buf -> f buf >>= k
{-# INLINE withBuffer #-}
-- | Get the size of the buffer
withSize :: (Int -> Builder) -> Builder
withSize f = Builder $ \ k buf@(Buffer _ _ _ l) ->
    runBuilder (f l) k buf
{-# INLINE withSize #-}
-- | Map the resulting list of texts.
mapBuilder :: ([S.Text] -> [S.Text]) -> Builder
mapBuilder f = Builder (fmap f .)
------------------------------------------------------------------------
-- | Ensure that there are at least @n@ many elements available.
ensureFree :: Int -> Builder
ensureFree !n = withSize $ \ l ->
    if n <= l
    then empty
    else flush `append'` withBuffer (const (newBuffer (max n smallChunkSize)))
{-# INLINE [0] ensureFree #-}
writeAtMost :: Int -> (forall s. A.MArray s -> Int -> ST s Int) -> Builder
writeAtMost n f = ensureFree n `append'` withBuffer (writeBuffer f)
{-# INLINE [0] writeAtMost #-}
-- | Ensure that @n@ many elements are available, and then use @f@ to
-- write some elements into the memory.
writeN :: Int -> (forall s. A.MArray s -> Int -> ST s ()) -> Builder
writeN n f = writeAtMost n (\ p o -> f p o >> return n)
{-# INLINE writeN #-}
writeBuffer :: (A.MArray s -> Int -> ST s Int) -> Buffer s -> ST s (Buffer s)
writeBuffer f (Buffer p o u l) = do
    n <- f p (o+u)
    return $! Buffer p o (u+n) (l-n)
{-# INLINE writeBuffer #-}
newBuffer :: Int -> ST s (Buffer s)
newBuffer size = do
    arr <- A.new size
    return $! Buffer arr 0 0 size
{-# INLINE newBuffer #-}
------------------------------------------------------------------------
-- Some nice rules for Builder
-- This function makes GHC understand that 'writeN' and 'ensureFree'
-- are *not* recursive in the precense of the rewrite rules below.
-- This is not needed with GHC 7+.
append' :: Builder -> Builder -> Builder
append' (Builder f) (Builder g) = Builder (f . g)
{-# INLINE append' #-}
{-# RULES
"append/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int)
                           (g::forall s. A.MArray s -> Int -> ST s Int) ws.
        append (writeAtMost a f) (append (writeAtMost b g) ws) =
            append (writeAtMost (a+b) (\marr o -> f marr o >>= \ n ->
                                            g marr (o+n) >>= \ m ->
                                            let s = n+m in s `seq` return s)) ws
"writeAtMost/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int)
                              (g::forall s. A.MArray s -> Int -> ST s Int).
        append (writeAtMost a f) (writeAtMost b g) =
            writeAtMost (a+b) (\marr o -> f marr o >>= \ n ->
                                      g marr (o+n) >>= \ m ->
                                      let s = n+m in s `seq` return s)
"ensureFree/ensureFree" forall a b .
        append (ensureFree a) (ensureFree b) = ensureFree (max a b)
"flush/flush"
        append flush flush = flush
 #-}

View File

@ -1,40 +0,0 @@
{-# LANGUAGE MagicHash #-}
-- |
-- Module : Data.Text.Internal.Builder.Functions
-- Copyright : (c) 2011 MailRank, Inc.
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Useful functions and combinators.
module Data.Text.Internal.Builder.Functions
    (
      (<>)
    , i2d
    ) where
import Data.Monoid (mappend)
import Data.Text.Lazy.Builder (Builder)
import GHC.Base (chr#,ord#,(+#),Int(I#),Char(C#))
import Prelude ()
-- | Unsafe conversion for decimal digits.
--
-- Adds the input to the code point of @\'0\'@ using unboxed
-- arithmetic; no range check is performed, so the caller must supply
-- a value in 0..9.
{-# INLINE i2d #-}
i2d :: Int -> Char
i2d (I# i#) = C# (chr# (ord# '0'# +# i#))
-- | The normal 'mappend' function with right associativity instead of
-- left.
(<>) :: Builder -> Builder -> Builder
(<>) = mappend
{-# INLINE (<>) #-}
infixr 4 <>

View File

@ -1,26 +0,0 @@
{-# LANGUAGE OverloadedStrings #-}
-- Module: Data.Text.Internal.Builder.Int.Digits
-- Copyright: (c) 2013 Bryan O'Sullivan
-- License: BSD-style
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: experimental
-- Portability: portable
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- This module exists because the C preprocessor does things that we
-- shall not speak of when confronted with Haskell multiline strings.
module Data.Text.Internal.Builder.Int.Digits (digits) where
import Data.ByteString.Char8 (ByteString)
-- | Lookup table of the two-digit decimal renderings of 0..99,
-- concatenated in order ("00", "01", ..., "99"); digit pair @n@
-- lives at byte offsets @2n@ and @2n+1@.
digits :: ByteString
digits = "0001020304050607080910111213141516171819\
         \2021222324252627282930313233343536373839\
         \4041424344454647484950515253545556575859\
         \6061626364656667686970717273747576777879\
         \8081828384858687888990919293949596979899"

View File

@ -1,57 +0,0 @@
{-# LANGUAGE CPP #-}
-- |
-- Module: Data.Text.Internal.Builder.RealFloat.Functions
-- Copyright: (c) The University of Glasgow 1994-2002
-- License: see libraries/base/LICENSE
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
module Data.Text.Internal.Builder.RealFloat.Functions
    (
      roundTo
    ) where
-- | Round a list of decimal digits to @d@ places.
--
-- Returns @(carry, digits)@ where @carry@ is 1 when rounding
-- overflowed into a new leading digit (e.g. 0.99 -> 1.0) and 0
-- otherwise.  The base-4.6+ branch rounds half-to-even; the older
-- branch rounds half-up.
roundTo :: Int -> [Int] -> (Int,[Int])
#if MIN_VERSION_base(4,6,0)
roundTo d is =
  case f d True is of
    x@(0,_) -> x
    (1,xs)  -> (1, 1:xs)
    _       -> error "roundTo: bad Value"
 where
  b2 = base `quot` 2
  f n _ [] = (0, replicate n 0)
  f 0 e (x:xs) | x == b2 && e && all (== 0) xs = (0, [])   -- Round to even when at exactly half the base
               | otherwise = (if x >= b2 then 1 else 0, [])
  f n _ (i:xs)
     | i' == base = (1,0:ds)
     | otherwise  = (0,i':ds)
      where
       (c,ds) = f (n-1) (even i) xs
       i'     = c + i
  base = 10
#else
roundTo d is =
  case f d is of
    x@(0,_) -> x
    (1,xs)  -> (1, 1:xs)
    _       -> error "roundTo: bad Value"
 where
  f n [] = (0, replicate n 0)
  f 0 (x:_) = (if x >= 5 then 1 else 0, [])
  f n (i:xs)
     | i' == 10  = (1,0:ds)
     | otherwise = (0,i':ds)
      where
       (c,ds) = f (n-1) xs
       i'     = c + i
#endif

View File

@ -1,208 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
-- |
-- Module : Data.Text.Internal.Encoding.Fusion
-- Copyright : (c) Tom Harper 2008-2009,
-- (c) Bryan O'Sullivan 2009,
-- (c) Duncan Coutts 2009
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fusible 'Stream'-oriented functions for converting between 'Text'
-- and several common encodings.
module Data.Text.Internal.Encoding.Fusion
    (
    -- * Streaming
      streamASCII
    , streamUtf8
    , streamUtf16LE
    , streamUtf16BE
    , streamUtf32LE
    , streamUtf32BE
    -- * Unstreaming
    , unstream
    , module Data.Text.Internal.Encoding.Fusion.Common
    ) where
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy)
import Data.Text.Internal.Fusion (Step(..), Stream(..))
import Data.Text.Internal.Fusion.Size
import Data.Text.Encoding.Error
import Data.Text.Internal.Encoding.Fusion.Common
import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32)
import Data.Text.Internal.Unsafe.Shift (shiftL, shiftR)
import Data.Word (Word8, Word16, Word32)
import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
import Foreign.Storable (pokeByteOff)
import qualified Data.ByteString as B
import qualified Data.ByteString.Unsafe as B
import qualified Data.Text.Internal.Encoding.Utf8 as U8
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import qualified Data.Text.Internal.Encoding.Utf32 as U32
import Data.Text.Unsafe (unsafeDupablePerformIO)
-- | Deprecated.  Treats each byte as a character; performs no
-- validation whatsoever.
streamASCII :: ByteString -> Stream Char
streamASCII bs = Stream next 0 (maxSize l)
    where
      l = B.length bs
      {-# INLINE next #-}
      next i
          | i >= l = Done
          | otherwise = Yield (unsafeChr8 x1) (i+1)
          where
            x1 = B.unsafeIndex bs i
{-# DEPRECATED streamASCII "Do not use this function" #-}
{-# INLINE [0] streamASCII #-}
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8
-- encoding.
streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
streamUtf8 onErr bs = Stream next 0 (maxSize l)
    where
      l = B.length bs
      -- Guards try 1..4-byte sequences in order; on failure the error
      -- handler decides whether to skip or substitute.
      next i
          | i >= l = Done
          | U8.validate1 x1 = Yield (unsafeChr8 x1) (i+1)
          | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2)
          | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3)
          | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4)
          | otherwise = decodeError "streamUtf8" "UTF-8" onErr (Just x1) (i+1)
          where
            x1 = idx i
            x2 = idx (i + 1)
            x3 = idx (i + 2)
            x4 = idx (i + 3)
            idx = B.unsafeIndex bs
{-# INLINE [0] streamUtf8 #-}
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
-- endian UTF-16 encoding.
streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
streamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
    where
      l = B.length bs
      {-# INLINE next #-}
      next i
          | i >= l = Done
          | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2)
          | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
          | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1)
          where
            x1 = idx i + (idx (i + 1) `shiftL` 8)
            x2 = idx (i + 2) + (idx (i + 3) `shiftL` 8)
            idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16
{-# INLINE [0] streamUtf16LE #-}
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
-- endian UTF-16 encoding.
streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
streamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
    where
      l = B.length bs
      {-# INLINE next #-}
      next i
          | i >= l = Done
          | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2)
          | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
          | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1)
          where
            x1 = (idx i `shiftL` 8) + idx (i + 1)
            x2 = (idx (i + 2) `shiftL` 8) + idx (i + 3)
            idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16
{-# INLINE [0] streamUtf16BE #-}
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
-- endian UTF-32 encoding.
streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
streamUtf32BE onErr bs = Stream next 0 (maxSize (l `shiftR` 2))
    where
      l = B.length bs
      {-# INLINE next #-}
      next i
          | i >= l = Done
          | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
          | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1)
          where
            x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
            x1 = idx i
            x2 = idx (i+1)
            x3 = idx (i+2)
            x4 = idx (i+3)
            idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32
{-# INLINE [0] streamUtf32BE #-}
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
-- endian UTF-32 encoding.
streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
streamUtf32LE onErr bs = Stream next 0 (maxSize (l `shiftR` 2))
    where
      l = B.length bs
      {-# INLINE next #-}
      next i
          | i >= l = Done
          | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
          | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1)
          where
            x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
            x1 = idx i
            x2 = idx $ i+1
            x3 = idx $ i+2
            x4 = idx $ i+3
            idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32
{-# INLINE [0] streamUtf32LE #-}
-- | /O(n)/ Convert a 'Stream' 'Word8' to a 'ByteString'.
--
-- Fills a heap buffer, doubling its size whenever it runs out of
-- room, then trims to the bytes actually written.
unstream :: Stream Word8 -> ByteString
unstream (Stream next s0 len) = unsafeDupablePerformIO $ do
  let mlen = upperBound 4 len
  mallocByteString mlen >>= loop mlen 0 s0
  where
    loop !n !off !s fp = case next s of
        Done -> trimUp fp n off
        Skip s' -> loop n off s' fp
        Yield x s'
            | off == n -> realloc fp n off s' x
            | otherwise -> do
          withForeignPtr fp $ \p -> pokeByteOff p off x
          loop n (off+1) s' fp
    {-# NOINLINE realloc #-}
    realloc fp n off s x = do
      let n' = n+n
      fp' <- copy0 fp n n'
      withForeignPtr fp' $ \p -> pokeByteOff p off x
      loop n' (off+1) s fp'
    {-# NOINLINE trimUp #-}
    trimUp fp _ off = return $! PS fp 0 off
    copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
    copy0 !src !srcLen !destLen =
#if defined(ASSERTS)
      assert (srcLen <= destLen) $
#endif
      do
        dest <- mallocByteString destLen
        withForeignPtr src $ \src' ->
          withForeignPtr dest $ \dest' ->
            memcpy dest' src' (fromIntegral srcLen)
        return dest
-- Dispatch a malformed-input byte to the caller's error handler; a
-- 'Nothing' answer skips the offending position, a 'Just' substitutes
-- a character.
decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
            -> s -> Step s Char
decodeError func kind onErr mb i =
    case onErr desc mb of
      Nothing -> Skip i
      Just c -> Yield c i
    where desc = "Data.Text.Internal.Encoding.Fusion." ++ func ++ ": Invalid " ++
                 kind ++ " stream"

View File

@ -1,123 +0,0 @@
{-# LANGUAGE BangPatterns #-}
-- |
-- Module : Data.Text.Internal.Encoding.Fusion.Common
-- Copyright : (c) Tom Harper 2008-2009,
-- (c) Bryan O'Sullivan 2009,
-- (c) Duncan Coutts 2009,
-- (c) Jasper Van der Jeugt 2011
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Use at your own risk!
--
-- Fusible 'Stream'-oriented functions for converting between 'Text'
-- and several common encodings.
module Data.Text.Internal.Encoding.Fusion.Common
    (
    -- * Restreaming
    -- Restreaming is the act of converting from one 'Stream'
    -- representation to another.
      restreamUtf16LE
    , restreamUtf16BE
    , restreamUtf32LE
    , restreamUtf32BE
    ) where
import Data.Bits ((.&.))
import Data.Text.Internal.Fusion (Step(..), Stream(..))
import Data.Text.Internal.Fusion.Types (RS(..))
import Data.Text.Internal.Unsafe.Char (ord)
import Data.Text.Internal.Unsafe.Shift (shiftR)
import Data.Word (Word8)
-- | Convert a 'Char' stream to a big-endian UTF-16 byte stream.
-- Pending bytes of a multi-byte unit are carried in the RS1..RS3
-- states and emitted one per step.
restreamUtf16BE :: Stream Char -> Stream Word8
restreamUtf16BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
    where
      next (RS0 s) = case next0 s of
          Done -> Done
          Skip s' -> Skip (RS0 s')
          Yield x s'
              | n < 0x10000 -> Yield (fromIntegral $ n `shiftR` 8) $
                               RS1 s' (fromIntegral n)
              | otherwise -> Yield c1 $ RS3 s' c2 c3 c4
              where
                n = ord x
                n1 = n - 0x10000
                c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
                c2 = fromIntegral (n1 `shiftR` 10)
                n2 = n1 .&. 0x3FF
                c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
                c4 = fromIntegral n2
      next (RS1 s x2) = Yield x2 (RS0 s)
      next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
      next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
      {-# INLINE next #-}
{-# INLINE restreamUtf16BE #-}
-- | Convert a 'Char' stream to a little-endian UTF-16 byte stream.
restreamUtf16LE :: Stream Char -> Stream Word8
restreamUtf16LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
    where
      next (RS0 s) = case next0 s of
          Done -> Done
          Skip s' -> Skip (RS0 s')
          Yield x s'
              | n < 0x10000 -> Yield (fromIntegral n) $
                               RS1 s' (fromIntegral $ shiftR n 8)
              | otherwise -> Yield c1 $ RS3 s' c2 c3 c4
              where
                n = ord x
                n1 = n - 0x10000
                c2 = fromIntegral (shiftR n1 18 + 0xD8)
                c1 = fromIntegral (shiftR n1 10)
                n2 = n1 .&. 0x3FF
                c4 = fromIntegral (shiftR n2 8 + 0xDC)
                c3 = fromIntegral n2
      next (RS1 s x2) = Yield x2 (RS0 s)
      next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
      next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
      {-# INLINE next #-}
{-# INLINE restreamUtf16LE #-}
-- | Convert a 'Char' stream to a big-endian UTF-32 byte stream.
--
-- NOTE(review): the size hint is @len * 2@ although UTF-32 emits four
-- bytes per character; 'Stream' sizes are hints, so this presumably
-- only under-estimates the buffer -- confirm against upstream text.
restreamUtf32BE :: Stream Char -> Stream Word8
restreamUtf32BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
    where
      next (RS0 s) = case next0 s of
          Done -> Done
          Skip s' -> Skip (RS0 s')
          Yield x s' -> Yield c1 (RS3 s' c2 c3 c4)
            where
              n = ord x
              c1 = fromIntegral $ shiftR n 24
              c2 = fromIntegral $ shiftR n 16
              c3 = fromIntegral $ shiftR n 8
              c4 = fromIntegral n
      next (RS1 s x2) = Yield x2 (RS0 s)
      next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
      next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
      {-# INLINE next #-}
{-# INLINE restreamUtf32BE #-}
-- | Convert a 'Char' stream to a little-endian UTF-32 byte stream.
--
-- NOTE(review): same @len * 2@ size-hint question as 'restreamUtf32BE'.
restreamUtf32LE :: Stream Char -> Stream Word8
restreamUtf32LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
    where
      next (RS0 s) = case next0 s of
          Done -> Done
          Skip s' -> Skip (RS0 s')
          Yield x s' -> Yield c1 (RS3 s' c2 c3 c4)
            where
              n = ord x
              c4 = fromIntegral $ shiftR n 24
              c3 = fromIntegral $ shiftR n 16
              c2 = fromIntegral $ shiftR n 8
              c1 = fromIntegral n
      next (RS1 s x2) = Yield x2 (RS0 s)
      next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
      next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
      {-# INLINE next #-}
{-# INLINE restreamUtf32LE #-}

View File

@ -1,45 +0,0 @@
{-# LANGUAGE MagicHash, BangPatterns #-}
-- |
-- Module : Data.Text.Internal.Encoding.Utf16
-- Copyright : (c) 2008, 2009 Tom Harper,
-- (c) 2009 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Basic UTF-16 validation and character manipulation.
module Data.Text.Internal.Encoding.Utf16
    (
      chr2
    , validate1
    , validate2
    ) where
import GHC.Exts
import GHC.Word (Word16(..))
-- | Combine a high and a low surrogate into the 'Char' they encode.
-- No validation is performed; callers must check with 'validate2'
-- first.
chr2 :: Word16 -> Word16 -> Char
chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
    where
      !x# = word2Int# a#
      !y# = word2Int# b#
      !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
      !lower# = y# -# 0xDC00#
{-# INLINE chr2 #-}
-- | 'True' when the code unit is a BMP scalar value (not a surrogate).
validate1 :: Word16 -> Bool
validate1 x1 = x1 < 0xD800 || x1 > 0xDFFF
{-# INLINE validate1 #-}
-- | 'True' when the pair is a well-formed high/low surrogate pair.
validate2 :: Word16 -> Word16 -> Bool
validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
                  x2 >= 0xDC00 && x2 <= 0xDFFF
{-# INLINE validate2 #-}

View File

@ -1,26 +0,0 @@
-- |
-- Module : Data.Text.Internal.Encoding.Utf32
-- Copyright : (c) 2008, 2009 Tom Harper,
-- (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Basic UTF-32 validation.
module Data.Text.Internal.Encoding.Utf32
    (
      validate
    ) where
import Data.Word (Word32)
-- | 'True' when the 32-bit value is a Unicode scalar value: within
-- the code space (at most U+10FFFF) and not a surrogate code point.
validate :: Word32 -> Bool
validate w = inCodeSpace && not surrogate
  where
    surrogate   = w >= 0xD800 && w <= 0xDFFF
    inCodeSpace = w <= 0x10FFFF
{-# INLINE validate #-}

View File

@ -1,168 +0,0 @@
{-# LANGUAGE CPP, MagicHash, BangPatterns #-}
-- |
-- Module : Data.Text.Internal.Encoding.Utf8
-- Copyright : (c) 2008, 2009 Tom Harper,
-- (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Basic UTF-8 validation and character manipulation.
module Data.Text.Internal.Encoding.Utf8
    (
    -- Decomposition
      ord2
    , ord3
    , ord4
    -- Construction
    , chr2
    , chr3
    , chr4
    -- * Validation
    , validate1
    , validate2
    , validate3
    , validate4
    ) where
#if defined(TEST_SUITE)
# undef ASSERTS
#endif
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.Bits ((.&.))
import Data.Text.Internal.Unsafe.Char (ord)
import Data.Text.Internal.Unsafe.Shift (shiftR)
import GHC.Exts
import GHC.Word (Word8(..))
default(Int)
-- Inclusive range check on a byte.
between :: Word8 -- ^ byte to check
        -> Word8 -- ^ lower bound
        -> Word8 -- ^ upper bound
        -> Bool
between x y z = x >= y && x <= z
{-# INLINE between #-}
-- | Encode a 'Char' as a two-byte UTF-8 sequence.  The caller must
-- ensure the code point is in U+0080..U+07FF (asserted under ASSERTS).
ord2 :: Char -> (Word8,Word8)
ord2 c =
#if defined(ASSERTS)
    assert (n >= 0x80 && n <= 0x07ff)
#endif
    (x1,x2)
    where
      n = ord c
      x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
      x2 = fromIntegral $ (n .&. 0x3F) + 0x80
-- | Encode a 'Char' as a three-byte UTF-8 sequence.  The caller must
-- ensure the code point is in U+0800..U+FFFF (asserted under ASSERTS).
ord3 :: Char -> (Word8,Word8,Word8)
ord3 c =
#if defined(ASSERTS)
    assert (n >= 0x0800 && n <= 0xffff)
#endif
    (x1,x2,x3)
    where
      n = ord c
      x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
      x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
      x3 = fromIntegral $ (n .&. 0x3F) + 0x80
-- | Encode a 'Char' as a four-byte UTF-8 sequence.  The caller must
-- ensure the code point is at least U+10000 (asserted under ASSERTS).
ord4 :: Char -> (Word8,Word8,Word8,Word8)
ord4 c =
#if defined(ASSERTS)
    assert (n >= 0x10000)
#endif
    (x1,x2,x3,x4)
    where
      n = ord c
      x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
      x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
      x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
      x4 = fromIntegral $ (n .&. 0x3F) + 0x80
-- | Decode a two-byte UTF-8 sequence.  No validation; use 'validate2'
-- first.
chr2 :: Word8 -> Word8 -> Char
chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
    where
      !y1# = word2Int# x1#
      !y2# = word2Int# x2#
      !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
      !z2# = y2# -# 0x80#
{-# INLINE chr2 #-}
-- | Decode a three-byte UTF-8 sequence.  No validation; use
-- 'validate3' first.
chr3 :: Word8 -> Word8 -> Word8 -> Char
chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
    where
      !y1# = word2Int# x1#
      !y2# = word2Int# x2#
      !y3# = word2Int# x3#
      !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
      !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
      !z3# = y3# -# 0x80#
{-# INLINE chr3 #-}
-- | Decode a four-byte UTF-8 sequence.  No validation; use
-- 'validate4' first.
chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
    C# (chr# (z1# +# z2# +# z3# +# z4#))
    where
      !y1# = word2Int# x1#
      !y2# = word2Int# x2#
      !y3# = word2Int# x3#
      !y4# = word2Int# x4#
      !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
      !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
      !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
      !z4# = y4# -# 0x80#
{-# INLINE chr4 #-}
validate1 :: Word8 -> Bool
validate1 x1 = x1 <= 0x7F
{-# INLINE validate1 #-}
validate2 :: Word8 -> Word8 -> Bool
validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF
{-# INLINE validate2 #-}
-- The validate3/validate4 cases mirror the rows of the Unicode
-- standard's table of well-formed UTF-8 byte sequences, rejecting
-- overlong forms, surrogates, and values past U+10FFFF.
validate3 :: Word8 -> Word8 -> Word8 -> Bool
{-# INLINE validate3 #-}
validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4
    where
      validate3_1 = (x1 == 0xE0) &&
                    between x2 0xA0 0xBF &&
                    between x3 0x80 0xBF
      validate3_2 = between x1 0xE1 0xEC &&
                    between x2 0x80 0xBF &&
                    between x3 0x80 0xBF
      validate3_3 = x1 == 0xED &&
                    between x2 0x80 0x9F &&
                    between x3 0x80 0xBF
      validate3_4 = between x1 0xEE 0xEF &&
                    between x2 0x80 0xBF &&
                    between x3 0x80 0xBF
validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
{-# INLINE validate4 #-}
validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3
    where
      validate4_1 = x1 == 0xF0 &&
                    between x2 0x90 0xBF &&
                    between x3 0x80 0xBF &&
                    between x4 0x80 0xBF
      validate4_2 = between x1 0xF1 0xF3 &&
                    between x2 0x80 0xBF &&
                    between x3 0x80 0xBF &&
                    between x4 0x80 0xBF
      validate4_3 = x1 == 0xF4 &&
                    between x2 0x80 0x8F &&
                    between x3 0x80 0xBF &&
                    between x4 0x80 0xBF

View File

@ -1,29 +0,0 @@
-- |
-- Module : Data.Text.Internal.Functions
-- Copyright : 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Useful functions.
module Data.Text.Internal.Functions
(
intersperse
) where
-- | A lazier version of Data.List.intersperse: the head of the input
-- is produced before the tail is examined, which avoids the space
-- leaks the library version can cause.
intersperse :: a -> [a] -> [a]
intersperse sep = start
  where
    start []     = []
    start (x:xs) = x : rest xs
    -- Prefix every remaining element with the separator.
    rest = foldr (\y acc -> sep : y : acc) []
{-# INLINE intersperse #-}

View File

@ -1,244 +0,0 @@
{-# LANGUAGE BangPatterns, MagicHash #-}
-- |
-- Module : Data.Text.Internal.Fusion
-- Copyright : (c) Tom Harper 2008-2009,
-- (c) Bryan O'Sullivan 2009-2010,
-- (c) Duncan Coutts 2009
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Text manipulation functions represented as fusible operations over
-- streams.
module Data.Text.Internal.Fusion
(
-- * Types
Stream(..)
, Step(..)
-- * Creation and elimination
, stream
, unstream
, reverseStream
, length
-- * Transformations
, reverse
-- * Construction
-- ** Scans
, reverseScanr
-- ** Accumulating maps
, mapAccumL
-- ** Generation and unfolding
, unfoldrN
-- * Indexing
, index
, findIndex
, countChar
) where
import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int,
Num(..), Ord(..), ($), (&&),
fromIntegral, otherwise)
import Data.Bits ((.&.))
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Private (runText)
import Data.Text.Internal.Unsafe.Char (ord, unsafeChr, unsafeWrite)
import Data.Text.Internal.Unsafe.Shift (shiftL, shiftR)
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Fusion.Common as S
import Data.Text.Internal.Fusion.Types
import Data.Text.Internal.Fusion.Size
import qualified Data.Text.Internal as I
import qualified Data.Text.Internal.Encoding.Utf16 as U16
default(Int)
-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.
stream :: Text -> Stream Char
-- The size hint: len UTF-16 code units hold between len/2 and len
-- code points.
stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
    where
      !end = off+len
      next !i
          | i >= end                   = Done
          -- A unit in 0xD800..0xDBFF is a lead surrogate: pair it with
          -- the following trail unit to produce a single 'Char'.
          | n >= 0xD800 && n <= 0xDBFF = Yield (U16.chr2 n n2) (i + 2)
          | otherwise                  = Yield (unsafeChr n) (i + 1)
          where
            n  = A.unsafeIndex arr i
            n2 = A.unsafeIndex arr (i + 1)
{-# INLINE [0] stream #-}

-- | /O(n)/ Convert a 'Text' into a 'Stream Char', but iterate
-- backwards.
reverseStream :: Text -> Stream Char
reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 1) len)
    where
      {-# INLINE next #-}
      next !i
          | i < off                    = Done
          -- A trail surrogate here means the preceding unit is its
          -- lead; consume both, still yielding one 'Char'.
          | n >= 0xDC00 && n <= 0xDFFF = Yield (U16.chr2 n2 n) (i - 2)
          | otherwise                  = Yield (unsafeChr n) (i - 1)
          where
            n  = A.unsafeIndex arr i
            n2 = A.unsafeIndex arr (i - 1)
{-# INLINE [0] reverseStream #-}
-- | /O(n)/ Convert a 'Stream Char' into a 'Text'.
unstream :: Stream Char -> Text
unstream (Stream next0 s0 len) = runText $ \done -> do
  -- Before encoding each char we perform a buffer realloc check assuming
  -- worst case encoding size of two 16-bit units for the char. Just add an
  -- extra space to the buffer so that we do not end up reallocating even when
  -- all the chars are encoded as single unit.
  let mlen = upperBound 4 len + 1
  arr0 <- A.new mlen
  let outer !arr !maxi = encode
        where
          -- keep the common case loop as small as possible
          encode !si !di =
              case next0 si of
                Done        -> done arr di
                Skip si'    -> encode si' di
                Yield c si'
                    -- simply check for the worst case
                    | maxi < di + 1 -> realloc si di
                    | otherwise -> do
                        n <- unsafeWrite arr di c
                        encode si' (di + n)
          -- keep uncommon case separate from the common case code
          {-# NOINLINE realloc #-}
          -- Grow the buffer geometrically and restart the outer loop on
          -- the copy; 'si' is re-examined, so no input is lost.
          realloc !si !di = do
            let newlen = (maxi + 1) * 2
            arr' <- A.new newlen
            A.copyM arr' 0 arr 0 di
            outer arr' (newlen - 1) si di
  outer arr0 (mlen - 1) s0 0
{-# INLINE [0] unstream #-}
-- Eliminate the intermediate array when a freshly built Text is
-- immediately re-streamed.
{-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-}
-- ----------------------------------------------------------------------------
-- * Basic stream functions

-- | /O(n)/ Count the characters (code points) in a stream.  This is a
-- monomorphic wrapper around the polymorphic 'S.lengthI'.
length :: Stream Char -> Int
length = S.lengthI
{-# INLINE[0] length #-}
-- | /O(n)/ Reverse the characters of a string.
reverse :: Stream Char -> Text
reverse (Stream next s len0)
    | isEmpty len0 = I.empty
    | otherwise    = I.text arr off' len'
  where
    len0' = upperBound 4 (larger len0 4)
    (arr, (off', len')) = A.run2 (A.new len0' >>= loop s (len0'-1) len0')
    -- The array is filled from the back towards the front; 'i' is the
    -- highest index not yet written.
    loop !s0 !i !len marr =
        case next s0 of
          Done -> return (marr, (j, len-j))
              where j = i + 1
          Skip s1    -> loop s1 i len marr
          Yield x s1 | i < least -> {-# SCC "reverse/resize" #-} do
                       -- No room left at the front: double the buffer
                       -- and copy the existing suffix to its end.
                       let newLen = len `shiftL` 1
                       marr' <- A.new newLen
                       A.copyM marr' (newLen-len) marr 0 len
                       write s1 (len+i) newLen marr'
                     | otherwise -> write s1 i len marr
              where n = ord x
                    -- Smallest index this write may touch: 1 if the
                    -- char needs a surrogate pair, otherwise 0.
                    least | n < 0x10000 = 0
                          | otherwise   = 1
                    m = n - 0x10000
                    lo = fromIntegral $ (m `shiftR` 10) + 0xD800
                    hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
                    write t j l mar
                        | n < 0x10000 = do
                            A.unsafeWrite mar j (fromIntegral n)
                            loop t (j-1) l mar
                        | otherwise = do
                            A.unsafeWrite mar (j-1) lo
                            A.unsafeWrite mar j hi
                            loop t (j-2) l mar
{-# INLINE [0] reverse #-}
-- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with
-- the input and result reversed.
reverseScanr :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char
reverseScanr f z0 (Stream next0 s0 len) = Stream next (Scan1 z0 s0) (len+1) -- HINT maybe too low
  where
    {-# INLINE next #-}
    -- Scan1 emits the seed first; Scan2 carries the last value emitted.
    next (Scan1 z s) = Yield z (Scan2 z s)
    next (Scan2 z s) = case next0 s of
                         Yield x s' -> let !x' = f x z
                                       in Yield x' (Scan2 x' s')
                         Skip s'    -> Skip (Scan2 z s')
                         Done       -> Done
{-# INLINE reverseScanr #-}

-- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed
-- value. However, the length of the result is limited by the
-- first argument to 'unfoldrN'. This function is more efficient than
-- 'unfoldr' when the length of the result is known.
unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Stream Char
unfoldrN n = S.unfoldrNI n
{-# INLINE [0] unfoldrN #-}
-------------------------------------------------------------------------------
-- ** Indexing streams

-- The three functions below are monomorphic wrappers around the
-- polymorphic versions in "Data.Text.Internal.Fusion.Common".

-- | /O(n)/ stream index (subscript) operator, starting from 0.
index :: Stream Char -> Int -> Char
index = S.indexI
{-# INLINE [0] index #-}

-- | The 'findIndex' function takes a predicate and a stream and
-- returns the index of the first element in the stream
-- satisfying the predicate.
findIndex :: (Char -> Bool) -> Stream Char -> Maybe Int
findIndex = S.findIndexI
{-# INLINE [0] findIndex #-}

-- | /O(n)/ The 'count' function returns the number of times the query
-- element appears in the given stream.
countChar :: Char -> Stream Char -> Int
countChar = S.countCharI
{-# INLINE [0] countChar #-}
-- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
-- function to each element of a 'Text', passing an accumulating
-- parameter from left to right, and returns a final 'Text'.
mapAccumL :: (a -> Char -> (a,Char)) -> a -> Stream Char -> (a, Text)
mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl)
  where
    (na,(nz,nl)) = A.run2 (A.new mlen >>= \arr -> outer arr mlen z0 s0 0)
      where mlen = upperBound 4 len
    outer arr top = loop
      where
        loop !z !s !i =
            case next0 s of
              Done          -> return (arr, (z,i))
              Skip s'       -> loop z s' i
              Yield x s'
                -- 'j' is the highest index this write may touch (one
                -- extra unit when 'c' needs a surrogate pair).
                | j >= top  -> {-# SCC "mapAccumL/resize" #-} do
                               let top' = (top + 1) `shiftL` 1
                               arr' <- A.new top'
                               A.copyM arr' 0 arr 0 top
                               outer arr' top' z s i
                | otherwise -> do d <- unsafeWrite arr i c
                                  loop z' s' (i+d)
                where (z',c) = f z x
                      j | ord c < 0x10000 = i
                        | otherwise       = i + 1
{-# INLINE [0] mapAccumL #-}

View File

@ -1,949 +0,0 @@
{-# LANGUAGE BangPatterns, MagicHash, Rank2Types #-}
-- |
-- Module : Data.Text.Internal.Fusion.Common
-- Copyright : (c) Bryan O'Sullivan 2009, 2012
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Common stream fusion functionality for text.
module Data.Text.Internal.Fusion.Common
(
-- * Creation and elimination
singleton
, streamList
, unstreamList
, streamCString#
-- * Basic interface
, cons
, snoc
, append
, head
, uncons
, last
, tail
, init
, null
, lengthI
, compareLengthI
, isSingleton
-- * Transformations
, map
, intercalate
, intersperse
-- ** Case conversion
-- $case
, toCaseFold
, toLower
, toTitle
, toUpper
-- ** Justification
, justifyLeftI
-- * Folds
, foldl
, foldl'
, foldl1
, foldl1'
, foldr
, foldr1
-- ** Special folds
, concat
, concatMap
, any
, all
, maximum
, minimum
-- * Construction
-- ** Scans
, scanl
-- ** Generation and unfolding
, replicateCharI
, replicateI
, unfoldr
, unfoldrNI
-- * Substrings
-- ** Breaking strings
, take
, drop
, takeWhile
, dropWhile
-- * Predicates
, isPrefixOf
-- * Searching
, elem
, filter
-- * Indexing
, findBy
, indexI
, findIndexI
, countCharI
-- * Zipping and unzipping
, zipWith
) where
import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..),
Ord(..), Ordering(..), String, (.), ($), (+), (-), (*), (++),
(&&), fromIntegral, otherwise)
import qualified Data.List as L
import qualified Prelude as P
import Data.Bits (shiftL)
import Data.Char (isLetter, isSpace)
import Data.Int (Int64)
import Data.Text.Internal.Fusion.Types
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, titleMapping,
upperMapping)
import Data.Text.Internal.Fusion.Size
import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#)
import GHC.Types (Char(..), Int(..))
-- | /O(1)/ A stream containing exactly one character.
singleton :: Char -> Stream Char
singleton c = Stream step False (codePointsSize 1)
  where
    -- The Bool state records whether the character has been emitted.
    step emitted
      | emitted   = Done
      | otherwise = Yield c True
{-# INLINE [0] singleton #-}
-- | /O(n)/ Convert a list into a 'Stream'; the list itself is the
-- stream state.
streamList :: [a] -> Stream a
{-# INLINE [0] streamList #-}
streamList s  = Stream next s unknownSize
    where next []       = Done
          next (x:xs)   = Yield x xs

-- | /O(n)/ Convert a 'Stream' back into a list.
unstreamList :: Stream a -> [a]
unstreamList (Stream next s0 _len) = unfold s0
    where unfold !s = case next s of
                        Done       -> []
                        Skip s'    -> unfold s'
                        Yield x s' -> x : unfold s'
{-# INLINE [0] unstreamList #-}
-- Cancel a round trip through a list so the intermediate list never
-- materializes.
{-# RULES "STREAM streamList/unstreamList fusion" forall s. streamList (unstreamList s) = s #-}
-- | Stream the UTF-8-like packed encoding used by GHC to represent
-- constant strings in generated code.
--
-- This encoding uses the byte sequence "\xc0\x80" to represent NUL,
-- and the string is NUL-terminated.
streamCString# :: Addr# -> Stream Char
streamCString# addr = Stream step 0 unknownSize
  where
    -- Decode by lead-byte range: <=0x7f one byte, <=0xdf two bytes,
    -- <=0xef three bytes, otherwise four.
    step !i
        | b == 0    = Done
        | b <= 0x7f = Yield (C# b#) (i+1)
        | b <= 0xdf = let !c = chr $ ((b-0xc0) `shiftL` 6) + next 1
                      in Yield c (i+2)
        | b <= 0xef = let !c = chr $ ((b-0xe0) `shiftL` 12) +
                                     (next 1  `shiftL` 6) +
                                      next 2
                      in Yield c (i+3)
        | otherwise = let !c = chr $ ((b-0xf0) `shiftL` 18) +
                                     (next 1  `shiftL` 12) +
                                     (next 2  `shiftL` 6) +
                                      next 3
                      in Yield c (i+4)
      -- 'next n' reads the n-th continuation byte and strips its 0x80
      -- tag bits.
      where b        = I# (ord# b#)
            next n   = I# (ord# (at# (i+n))) - 0x80
            !b#      = at# i
    at# (I# i#) = indexCharOffAddr# addr i#
    chr (I# i#) = C# (chr# i#)
{-# INLINE [0] streamCString# #-}
-- ----------------------------------------------------------------------------
-- * Basic stream functions

-- Two-phase state for 'cons': 'C1' means the new head character has not
-- been emitted yet; 'C0' means we are draining the original stream.
data C s = C0 !s
         | C1 !s

-- | /O(n)/ Adds a character to the front of a Stream Char.
cons :: Char -> Stream Char -> Stream Char
cons !w (Stream next0 s0 len) = Stream next (C1 s0) (len + codePointsSize 1)
    where
      next (C1 s) = Yield w (C0 s)
      next (C0 s) = case next0 s of
                      Done       -> Done
                      Skip s'    -> Skip (C0 s')
                      Yield x s' -> Yield x (C0 s')
{-# INLINE [0] cons #-}
-- State for 'snoc': 'J' while the original stream still has elements,
-- 'N' once the appended character has been emitted.
data Snoc a = N
            | J !a

-- | /O(n)/ Adds a character to the end of a stream.
snoc :: Stream Char -> Char -> Stream Char
snoc (Stream next0 xs0 len) w = Stream next (J xs0) (len + codePointsSize 1)
  where
    next (J xs) = case next0 xs of
      Done        -> Yield w N
      Skip xs'    -> Skip (J xs')
      Yield x xs' -> Yield x (J xs')
    next N = Done
{-# INLINE [0] snoc #-}
-- Sum type tracking which of the two streams 'append' is currently
-- draining: 'L' for the first, 'R' for the second.
data E l r = L !l
           | R !r

-- | /O(n)/ Appends one Stream to the other.
append :: Stream Char -> Stream Char -> Stream Char
append (Stream next0 s01 len1) (Stream next1 s02 len2) =
    Stream next (L s01) (len1 + len2)
    where
      next (L s1) = case next0 s1 of
                      Done        -> Skip    (R s02)
                      Skip s1'    -> Skip    (L s1')
                      Yield x s1' -> Yield x (L s1')
      next (R s2) = case next1 s2 of
                      Done        -> Done
                      Skip s2'    -> Skip    (R s2')
                      Yield x s2' -> Yield x (R s2')
{-# INLINE [0] append #-}
-- | /O(1)/ Returns the first character of a Text, which must be non-empty.
-- Subject to array fusion.
head :: Stream Char -> Char
head (Stream next s0 _len) = loop_head s0
    where
      loop_head !s = case next s of
                       Yield x _ -> x
                       Skip s'   -> loop_head s'
                       Done      -> head_empty
{-# INLINE [0] head #-}

-- Shared error value; NOINLINE keeps the exception-raising code out of
-- the inlined hot path of 'head'.
head_empty :: a
head_empty = streamError "head" "Empty stream"
{-# NOINLINE head_empty #-}

-- | /O(1)/ Returns the first character and remainder of a 'Stream
-- Char', or 'Nothing' if empty.  Subject to array fusion.
uncons :: Stream Char -> Maybe (Char, Stream Char)
uncons (Stream next s0 len) = loop_uncons s0
    where
      loop_uncons !s = case next s of
                         -- The tail's size hint shrinks by one code point.
                         Yield x s1 -> Just (x, Stream next s1 (len - codePointsSize 1))
                         Skip s'    -> loop_uncons s'
                         Done       -> Nothing
{-# INLINE [0] uncons #-}
-- | /O(n)/ Returns the last character of a 'Stream Char', which must
-- be non-empty.
last :: Stream Char -> Char
last (Stream next s0 _len) = loop0_last s0
    where
      -- loop0 runs until the first element is seen; loop then tracks
      -- the most recent element until the stream ends.
      loop0_last !s = case next s of
                        Done       -> emptyError "last"
                        Skip s'    -> loop0_last  s'
                        Yield x s' -> loop_last x s'
      loop_last !x !s = case next s of
                         Done        -> x
                         Skip s'     -> loop_last x  s'
                         Yield x' s' -> loop_last x' s'
{-# INLINE[0] last #-}

-- | /O(1)/ Returns all characters after the head of a Stream Char, which must
-- be non-empty.
tail :: Stream Char -> Stream Char
tail (Stream next0 s0 len) = Stream next (C0 s0) (len - codePointsSize 1)
    where
      -- C0: head not yet discarded; C1: passing the remainder through.
      next (C0 s) = case next0 s of
                      Done       -> emptyError "tail"
                      Skip s'    -> Skip (C0 s')
                      Yield _ s' -> Skip (C1 s')
      next (C1 s) = case next0 s of
                      Done       -> Done
                      Skip s'    -> Skip    (C1 s')
                      Yield x s' -> Yield x (C1 s')
{-# INLINE [0] tail #-}
-- State for 'init': 'Init1' holds the previous element back one step so
-- the final element is never emitted.
data Init s = Init0 !s
            | Init1 {-# UNPACK #-} !Char !s

-- | /O(1)/ Returns all but the last character of a Stream Char, which
-- must be non-empty.
init :: Stream Char -> Stream Char
init (Stream next0 s0 len) = Stream next (Init0 s0) (len - codePointsSize 1)
    where
      next (Init0 s) = case next0 s of
                         Done       -> emptyError "init"
                         Skip s'    -> Skip (Init0 s')
                         Yield x s' -> Skip (Init1 x s')
      next (Init1 x s)  = case next0 s of
                            Done        -> Done
                            Skip s'     -> Skip    (Init1 x s')
                            Yield x' s' -> Yield x (Init1 x' s')
{-# INLINE [0] init #-}

-- | /O(1)/ Tests whether a Stream Char is empty or not.
null :: Stream Char -> Bool
null (Stream next s0 _len) = loop_null s0
    where
      loop_null !s = case next s of
                       Done      -> True
                       Yield _ _ -> False
                       Skip s'   -> loop_null s'
{-# INLINE[0] null #-}
-- | /O(n)/ Returns the number of characters in a string.
lengthI :: Integral a => Stream Char -> a
lengthI (Stream next s0 _len) = loop_length 0 s0
    where
      loop_length !z s  = case next s of
                           Done       -> z
                           Skip    s' -> loop_length z s'
                           Yield _ s' -> loop_length (z + 1) s'
{-# INLINE[0] lengthI #-}

-- | /O(n)/ Compares the count of characters in a string to a number.
-- Subject to fusion.
--
-- This function gives the same answer as comparing against the result
-- of 'lengthI', but can short circuit if the count of characters is
-- greater than the number or if the stream can't possibly be as long
-- as the number supplied, and hence be more efficient.
compareLengthI :: Integral a => Stream Char -> a -> Ordering
compareLengthI (Stream next s0 len) n
    -- Note that @len@ tracks code units whereas we want to compare the length
    -- in code points. Specifically, a stream with hint @len@ may consist of
    -- anywhere from @len/2@ to @len@ code points.
    | Just r <- compareSize len n' = r
    | otherwise                    = loop_cmp 0 s0
    where
      n' = codePointsSize $ fromIntegral n
      loop_cmp !z s  = case next s of
                         Done       -> compare z n
                         Skip    s' -> loop_cmp z s'
                         Yield _ s' | z > n     -> GT
                                    | otherwise -> loop_cmp (z + 1) s'
{-# INLINE[0] compareLengthI #-}

-- | /O(n)/ Indicate whether a string contains exactly one element.
isSingleton :: Stream Char -> Bool
isSingleton (Stream next s0 _len) = loop 0 s0
    where
      loop !z s  = case next s of
                     Done            -> z == (1::Int)
                     Skip    s'      -> loop z s'
                     Yield _ s'
                       -- Stop as soon as a second element is seen.
                       | z >= 1      -> False
                       | otherwise   -> loop (z+1) s'
{-# INLINE[0] isSingleton #-}
-- ----------------------------------------------------------------------------
-- * Stream transformations

-- | /O(n)/ 'map' @f @xs is the Stream Char obtained by applying @f@
-- to each element of @xs@.
map :: (Char -> Char) -> Stream Char -> Stream Char
map f (Stream next0 s0 len) = Stream next s0 len
    where
      next !s = case next0 s of
                  Done       -> Done
                  Skip s'    -> Skip s'
                  Yield x s' -> Yield (f x) s'
{-# INLINE [0] map #-}

-- Fuse two consecutive maps into one traversal.
{-# RULES
"STREAM map/map fusion" forall f g s.
    map f (map g s) = map (\x -> f (g x)) s
 #-}

-- State for 'intersperse': 'I1' before the first element, 'I2' holding
-- an element ready to emit, 'I3' between elements (separator pending).
data I s = I1 !s
         | I2 !s {-# UNPACK #-} !Char
         | I3 !s

-- | /O(n)/ Take a character and place it between each of the
-- characters of a 'Stream Char'.
intersperse :: Char -> Stream Char -> Stream Char
intersperse c (Stream next0 s0 len) = Stream next (I1 s0) (len + unknownSize)
    where
      next (I1 s) = case next0 s of
        Done       -> Done
        Skip s'    -> Skip (I1 s')
        Yield x s' -> Skip (I2 s' x)
      next (I2 s x)  = Yield x (I3 s)
      next (I3 s) = case next0 s of
        Done       -> Done
        Skip s'    -> Skip (I3 s')
        Yield x s' -> Yield c (I2 s' x)
{-# INLINE [0] intersperse #-}
-- ----------------------------------------------------------------------------
-- ** Case conversions (folds)

-- $case
--
-- With Unicode text, it is incorrect to use combinators like @map
-- toUpper@ to case convert each character of a string individually.
-- Instead, use the whole-string case conversion functions from this
-- module. For correctness in different writing systems, these
-- functions may map one input character to two or three output
-- characters.

-- | Map a 'Stream' through the given case-mapping function.
caseConvert :: (forall s. Char -> s -> Step (CC s) Char)
            -> Stream Char -> Stream Char
caseConvert remap (Stream next0 s0 len) =
    Stream next (CC s0 '\0' '\0') (len `unionSize` (3*len))
  where
    -- The CC state buffers up to two pending output characters ('\0'
    -- marks an empty slot): a single input char can case-map to as
    -- many as three output chars.
    next (CC s '\0' _) =
        case next0 s of
          Done       -> Done
          Skip s'    -> Skip (CC s' '\0' '\0')
          Yield c s' -> remap c s'
    next (CC s a b) = Yield a (CC s b '\0')
-- | /O(n)/ Convert a string to folded case. This function is mainly
-- useful for performing caseless (or case insensitive) string
-- comparisons.
--
-- A string @x@ is a caseless match for a string @y@ if and only if:
--
-- @toCaseFold x == toCaseFold y@
--
-- The result string may be longer than the input string, and may
-- differ from applying 'toLower' to the input string. For instance,
-- the Armenian small ligature men-now (U+FB13) is case folded to the
-- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
-- case folded to the Greek small letter mu (U+03BC) instead of
-- itself.
toCaseFold :: Stream Char -> Stream Char
toCaseFold = caseConvert foldMapping
{-# INLINE [0] toCaseFold #-}

-- | /O(n)/ Convert a string to upper case, using simple case
-- conversion. The result string may be longer than the input string.
-- For instance, the German eszett (U+00DF) maps to the two-letter
-- sequence SS.
toUpper :: Stream Char -> Stream Char
toUpper = caseConvert upperMapping
{-# INLINE [0] toUpper #-}

-- | /O(n)/ Convert a string to lower case, using simple case
-- conversion. The result string may be longer than the input string.
-- For instance, the Latin capital letter I with dot above (U+0130)
-- maps to the sequence Latin small letter i (U+0069) followed by
-- combining dot above (U+0307).
toLower :: Stream Char -> Stream Char
toLower = caseConvert lowerMapping
{-# INLINE [0] toLower #-}
-- | /O(n)/ Convert a string to title case, using simple case
-- conversion.
--
-- The first letter of the input is converted to title case, as is
-- every subsequent letter that immediately follows a non-letter.
-- Every letter that immediately follows another letter is converted
-- to lower case.
--
-- The result string may be longer than the input string. For example,
-- the Latin small ligature &#xfb02; (U+FB02) is converted to the
-- sequence Latin capital letter F (U+0046) followed by Latin small
-- letter l (U+006C).
--
-- /Note/: this function does not take language or culture specific
-- rules into account. For instance, in English, different style
-- guides disagree on whether the book name \"The Hill of the Red
-- Fox\" is correctly title cased&#x2014;but this function will
-- capitalize /every/ word.
toTitle :: Stream Char -> Stream Char
toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') (len + unknownSize)
  where
    -- The Bool in the state records whether the previous character was
    -- a letter, which decides between lower- and title-mapping.
    next (CC (letter :*: s) '\0' _) =
        case next0 s of
          Done           -> Done
          Skip s'        -> Skip (CC (letter :*: s') '\0' '\0')
          Yield c s'
            | nonSpace   -> if letter
                            then lowerMapping c (nonSpace :*: s')
                            else titleMapping c (letter' :*: s')
            | otherwise  -> Yield c (CC (letter' :*: s') '\0' '\0')
            where nonSpace = P.not (isSpace c)
                  letter'  = isLetter c
    next (CC s a b) = Yield a (CC s b '\0')
{-# INLINE [0] toTitle #-}
-- State for 'justifyLeftI': 'Just1' counts characters while copying
-- the input, 'Just2' emits the remaining padding.
data Justify i s = Just1 !i !s
                 | Just2 !i !s

-- | Pad a stream on the right with copies of @c@ until it is at least
-- @k@ characters long.
justifyLeftI :: Integral a => a -> Char -> Stream Char -> Stream Char
justifyLeftI k c (Stream next0 s0 len) =
    Stream next (Just1 0 s0) (larger (fromIntegral k * charSize c + len) len)
  where
    next (Just1 n s) =
        case next0 s of
          Done       -> next (Just2 n s)
          Skip s'    -> Skip (Just1 n s')
          Yield x s' -> Yield x (Just1 (n+1) s')
    next (Just2 n s)
        | n < k     = Yield c (Just2 (n+1) s)
        | otherwise = Done
    {-# INLINE next #-}
{-# INLINE [0] justifyLeftI #-}
-- ----------------------------------------------------------------------------
-- * Reducing Streams (folds)

-- | foldl, applied to a binary operator, a starting value (typically the
-- left-identity of the operator), and a Stream, reduces the Stream using the
-- binary operator, from left to right.
foldl :: (b -> Char -> b) -> b -> Stream Char -> b
foldl f z0 (Stream next s0 _len) = loop_foldl z0 s0
    where
      -- Note: the accumulator is deliberately lazy here.
      loop_foldl z !s = case next s of
                          Done -> z
                          Skip s' -> loop_foldl z s'
                          Yield x s' -> loop_foldl (f z x) s'
{-# INLINE [0] foldl #-}

-- | A strict version of foldl.
foldl' :: (b -> Char -> b) -> b -> Stream Char -> b
foldl' f z0 (Stream next s0 _len) = loop_foldl' z0 s0
    where
      -- The bang on the accumulator forces it at every step, keeping
      -- the fold in constant space.
      loop_foldl' !z !s = case next s of
                            Done -> z
                            Skip s' -> loop_foldl' z s'
                            Yield x s' -> loop_foldl' (f z x) s'
{-# INLINE [0] foldl' #-}
-- | foldl1 is a variant of foldl that has no starting value argument,
-- and thus must be applied to non-empty Streams.
foldl1 :: (Char -> Char -> Char) -> Stream Char -> Char
foldl1 f (Stream next s0 _len) = loop0_foldl1 s0
    where
      -- loop0 finds the first element (the implicit starting value);
      -- loop then folds over the rest.
      loop0_foldl1 !s = case next s of
                          Skip s' -> loop0_foldl1 s'
                          Yield x s' -> loop_foldl1 x s'
                          Done -> emptyError "foldl1"
      loop_foldl1 z !s = case next s of
                           Done -> z
                           Skip s' -> loop_foldl1 z s'
                           Yield x s' -> loop_foldl1 (f z x) s'
{-# INLINE [0] foldl1 #-}
-- | A strict version of foldl1.  Must be applied to a non-empty
-- stream; raises an empty-input error otherwise.
foldl1' :: (Char -> Char -> Char) -> Stream Char -> Char
foldl1' f (Stream next s0 _len) = loop0_foldl1' s0
    where
      -- loop0 finds the first element (the implicit starting value);
      -- loop then folds strictly over the rest.
      loop0_foldl1' !s = case next s of
                           Skip s' -> loop0_foldl1' s'
                           Yield x s' -> loop_foldl1' x s'
                           -- Fix: report the failing function as
                           -- foldl1', not its lazy sibling foldl1.
                           Done -> emptyError "foldl1'"
      loop_foldl1' !z !s = case next s of
                             Done -> z
                             Skip s' -> loop_foldl1' z s'
                             Yield x s' -> loop_foldl1' (f z x) s'
{-# INLINE [0] foldl1' #-}
-- | 'foldr', applied to a binary operator, a starting value (typically the
-- right-identity of the operator), and a stream, reduces the stream using the
-- binary operator, from right to left.
foldr :: (Char -> b -> b) -> b -> Stream Char -> b
foldr f z (Stream next s0 _len) = loop_foldr s0
    where
      -- Lazy in the result of the recursive call, so 'f' can short-circuit.
      loop_foldr !s = case next s of
                        Done -> z
                        Skip s' -> loop_foldr s'
                        Yield x s' -> f x (loop_foldr s')
{-# INLINE [0] foldr #-}

-- | foldr1 is a variant of 'foldr' that has no starting value argument,
-- and thus must be applied to non-empty streams.
-- Subject to array fusion.
foldr1 :: (Char -> Char -> Char) -> Stream Char -> Char
foldr1 f (Stream next s0 _len) = loop0_foldr1 s0
  where
    -- loop0 skips to the first element; loop holds one element back so
    -- the final element becomes the fold's base case.
    loop0_foldr1 !s = case next s of
      Done       -> emptyError "foldr1"
      Skip    s' -> loop0_foldr1  s'
      Yield x s' -> loop_foldr1 x s'
    loop_foldr1 x !s = case next s of
      Done        -> x
      Skip     s' -> loop_foldr1 x s'
      Yield x' s' -> f x (loop_foldr1 x' s')
{-# INLINE [0] foldr1 #-}
-- | Insert the separator stream between the streams of the list and
-- concatenate the result.
intercalate :: Stream Char -> [Stream Char] -> Stream Char
intercalate sep chunks = concat (L.intersperse sep chunks)
{-# INLINE [0] intercalate #-}
-- ----------------------------------------------------------------------------
-- ** Special folds

-- | /O(n)/ Concatenate a list of streams. Subject to array fusion.
concat :: [Stream Char] -> Stream Char
concat = L.foldr append empty
{-# INLINE [0] concat #-}

-- | Map a function over a stream that results in a stream and concatenate the
-- results.
concatMap :: (Char -> Stream Char) -> Stream Char -> Stream Char
concatMap f = foldr (append . f) empty
{-# INLINE [0] concatMap #-}
-- | /O(n)/ any @p @xs determines if any character in the stream
-- @xs@ satisfies the predicate @p@.  Short-circuits on the first match.
any :: (Char -> Bool) -> Stream Char -> Bool
any p (Stream next0 s0 _len) = loop_any s0
    where
      loop_any !s = case next0 s of
                      Done                   -> False
                      Skip s'                -> loop_any s'
                      Yield x s' | p x       -> True
                                 | otherwise -> loop_any s'
{-# INLINE [0] any #-}

-- | /O(n)/ all @p @xs determines if all characters in the 'Text'
-- @xs@ satisfy the predicate @p@.  Short-circuits on the first failure.
all :: (Char -> Bool) -> Stream Char -> Bool
all p (Stream next0 s0 _len) = loop_all s0
    where
      loop_all !s = case next0 s of
                      Done                   -> True
                      Skip s'                -> loop_all s'
                      Yield x s' | p x       -> loop_all s'
                                 | otherwise -> False
{-# INLINE [0] all #-}
-- | /O(n)/ maximum returns the maximum value from a stream, which must be
-- non-empty.
maximum :: Stream Char -> Char
maximum (Stream next0 s0 _len) = loop0_maximum s0
    where
      -- loop0 finds the first element to seed the running maximum.
      loop0_maximum !s   = case next0 s of
                             Done       -> emptyError "maximum"
                             Skip s'    -> loop0_maximum s'
                             Yield x s' -> loop_maximum x s'
      loop_maximum !z !s = case next0 s of
                             Done            -> z
                             Skip s'         -> loop_maximum z s'
                             Yield x s'
                                 | x > z     -> loop_maximum x s'
                                 | otherwise -> loop_maximum z s'
{-# INLINE [0] maximum #-}

-- | /O(n)/ minimum returns the minimum value from a 'Text', which must be
-- non-empty.
minimum :: Stream Char -> Char
minimum (Stream next0 s0 _len) = loop0_minimum s0
    where
      -- loop0 finds the first element to seed the running minimum.
      loop0_minimum !s   = case next0 s of
                             Done       -> emptyError "minimum"
                             Skip s'    -> loop0_minimum s'
                             Yield x s' -> loop_minimum x s'
      loop_minimum !z !s = case next0 s of
                             Done            -> z
                             Skip s'         -> loop_minimum z s'
                             Yield x s'
                                 | x < z     -> loop_minimum x s'
                                 | otherwise -> loop_minimum z s'
{-# INLINE [0] minimum #-}
-- -----------------------------------------------------------------------------
-- * Building streams

-- | Like list 'Prelude.scanl': the seed is emitted first, then each
-- successive accumulator value.
scanl :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char
scanl f z0 (Stream next0 s0 len) = Stream next (Scan1 z0 s0) (len+1) -- HINT maybe too low
  where
    {-# INLINE next #-}
    -- Scan1 emits the seed; Scan2 carries the last emitted value.
    next (Scan1 z s) = Yield z (Scan2 z s)
    next (Scan2 z s) = case next0 s of
                         Yield x s' -> let !x' = f z x
                                       in Yield x' (Scan2 x' s')
                         Skip s'    -> Skip (Scan2 z s')
                         Done       -> Done
{-# INLINE [0] scanl #-}
-- -----------------------------------------------------------------------------
-- ** Generating and unfolding streams

-- | Produce a stream of @n@ copies of the character; empty for
-- negative @n@.
replicateCharI :: Integral a => a -> Char -> Stream Char
replicateCharI !n !c
    | n < 0     = empty
    | otherwise = Stream next 0 (fromIntegral n) -- HINT maybe too low
  where
    next !i | i >= n    = Done
            | otherwise = Yield c (i + 1)
{-# INLINE [0] replicateCharI #-}

-- State for 'replicateI': the inner stream's state plus the count of
-- completed repetitions.
data RI s = RI !s {-# UNPACK #-} !Int64

-- | Repeat the whole stream @n@ times, restarting from the saved
-- initial state at the end of each pass.
replicateI :: Int64 -> Stream Char -> Stream Char
replicateI n (Stream next0 s0 len) =
    Stream next (RI s0 0) (fromIntegral (max 0 n) * len)
    where
      next (RI s k)
          | k >= n = Done
          | otherwise = case next0 s of
                          Done       -> Skip    (RI s0 (k+1))
                          Skip s'    -> Skip    (RI s' k)
                          Yield x s' -> Yield x (RI s' k)
{-# INLINE [0] replicateI #-}
-- | /O(n)/, where @n@ is the length of the result. The unfoldr function
-- is analogous to the List 'unfoldr'. unfoldr builds a stream
-- from a seed value. The function takes the element and returns
-- Nothing if it is done producing the stream or returns Just
-- (a,b), in which case, a is the next Char in the string, and b is
-- the seed value for further production.
unfoldr :: (a -> Maybe (Char,a)) -> a -> Stream Char
unfoldr f s0 = Stream next s0 unknownSize
    where
      {-# INLINE next #-}
      next !s = case f s of
                  Nothing      -> Done
                  Just (w, s') -> Yield w s'
{-# INLINE [0] unfoldr #-}

-- | /O(n)/ Like 'unfoldr', 'unfoldrNI' builds a stream from a seed
-- value. However, the length of the result is limited by the
-- first argument to 'unfoldrNI'. This function is more efficient than
-- 'unfoldr' when the length of the result is known.
unfoldrNI :: Integral a => a -> (b -> Maybe (Char,b)) -> b -> Stream Char
unfoldrNI n f s0 | n <  0    = empty
                 | otherwise = Stream next (0 :*: s0) (maxSize $ fromIntegral (n*2))
    where
      {-# INLINE next #-}
      -- The state pairs a count of produced characters with the seed so
      -- production stops after n elements even if f keeps succeeding.
      next (z :*: s) = case f s of
          Nothing                  -> Done
          Just (w, s') | z >= n    -> Done
                       | otherwise -> Yield w ((z + 1) :*: s')
{-# INLINE unfoldrNI #-}
-------------------------------------------------------------------------------
-- * Substreams

-- | /O(n)/ @'take' n@, applied to a stream, returns the prefix of the
-- stream of length @n@, or the stream itself if @n@ is greater than the
-- length of the stream.
take :: Integral a => a -> Stream Char -> Stream Char
take n0 (Stream next0 s0 len) =
    Stream next (n0' :*: s0) (smaller len (codePointsSize $ fromIntegral n0'))
    where
      -- Clamp negative counts to zero.
      n0' = max n0 0
      {-# INLINE next #-}
      next (n :*: s) | n <= 0    = Done
                     | otherwise = case next0 s of
                                     Done -> Done
                                     Skip s' -> Skip (n :*: s')
                                     Yield x s' -> Yield x ((n-1) :*: s')
{-# INLINE [0] take #-}
-- State for 'drop': 'JS' while still discarding, 'NS' once the
-- remaining elements are passed through unchanged.
data Drop a s = NS !s
              | JS !a !s

-- | /O(n)/ @'drop' n@, applied to a stream, returns the suffix of the
-- stream after the first @n@ characters, or the empty stream if @n@
-- is greater than the length of the stream.
drop :: Integral a => a -> Stream Char -> Stream Char
drop n0 (Stream next0 s0 len) =
    Stream next (JS n0' s0) (len - codePointsSize (fromIntegral n0'))
  where
    -- Clamp negative counts to zero.
    n0' = max n0 0
    {-# INLINE next #-}
    next (JS n s)
      | n <= 0    = Skip (NS s)
      | otherwise = case next0 s of
          Done       -> Done
          Skip    s' -> Skip (JS n    s')
          Yield _ s' -> Skip (JS (n-1) s')
    next (NS s) = case next0 s of
      Done       -> Done
      Skip    s' -> Skip    (NS s')
      Yield x s' -> Yield x (NS s')
{-# INLINE [0] drop #-}
-- | 'takeWhile', applied to a predicate @p@ and a stream, returns the
-- longest prefix (possibly empty) of elements that satisfy @p@.
takeWhile :: (Char -> Bool) -> Stream Char -> Stream Char
takeWhile p (Stream next0 s0 len) = Stream next s0 (len - unknownSize)
    where
      {-# INLINE next #-}
      next !s = case next0 s of
                  Done    -> Done
                  Skip s' -> Skip s'
                  Yield x s' | p x       -> Yield x s'
                             | otherwise -> Done
{-# INLINE [0] takeWhile #-}

-- | @'dropWhile' p xs@ returns the suffix remaining after @'takeWhile' p xs@.
dropWhile :: (Char -> Bool) -> Stream Char -> Stream Char
dropWhile p (Stream next0 s0 len) = Stream next (L s0) (len - unknownSize)
    where
    {-# INLINE next #-}
    -- L: still dropping; R: the predicate has failed once, so the rest
    -- of the stream is passed through unchanged.
    next (L s)  = case next0 s of
      Done                   -> Done
      Skip s'                -> Skip    (L s')
      Yield x s' | p x       -> Skip    (L s')
                 | otherwise -> Yield x (R s')
    next (R s) = case next0 s of
      Done       -> Done
      Skip s'    -> Skip    (R s')
      Yield x s' -> Yield x (R s')
{-# INLINE [0] dropWhile #-}
-- | /O(n)/ The 'isPrefixOf' function takes two 'Stream's and returns
-- 'True' iff the first is a prefix of the second.
isPrefixOf :: (Eq a) => Stream a -> Stream a -> Bool
isPrefixOf (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2)
    where
      -- An exhausted first stream means every element matched; an
      -- exhausted second stream (with the first still producing) means
      -- the candidate prefix is longer than the stream.
      loop Done      _    = True
      loop _    Done = False
      loop (Skip s1')     (Skip s2')     = loop (next1 s1') (next2 s2')
      loop (Skip s1')     x2             = loop (next1 s1') x2
      loop x1             (Skip s2')     = loop x1          (next2 s2')
      loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 &&
                                           loop (next1 s1') (next2 s2')
{-# INLINE [0] isPrefixOf #-}
-- ----------------------------------------------------------------------------
-- * Searching

-------------------------------------------------------------------------------
-- ** Searching by equality

-- | /O(n)/ 'elem' is the stream membership predicate.  Short-circuits
-- on the first match.
elem :: Char -> Stream Char -> Bool
elem w (Stream next s0 _len) = loop_elem s0
    where
      loop_elem !s = case next s of
                       Done -> False
                       Skip s' -> loop_elem s'
                       Yield x s' | x == w -> True
                                  | otherwise -> loop_elem s'
{-# INLINE [0] elem #-}
-------------------------------------------------------------------------------
-- ** Searching with a predicate
-- | /O(n)/ The 'findBy' function takes a predicate and a stream,
-- and returns the first element matching the predicate, or 'Nothing'
-- if there is no such element.
findBy :: (Char -> Bool) -> Stream Char -> Maybe Char
findBy p (Stream step s0 _len) = go s0
  where
    go !s =
      case step s of
        Done    -> Nothing
        Skip s' -> go s'
        Yield c s'
          | p c       -> Just c
          | otherwise -> go s'
{-# INLINE [0] findBy #-}
-- | /O(n)/ Stream index (subscript) operator, starting from 0.
indexI :: Integral a => Stream Char -> a -> Char
indexI (Stream step s0 _len) n0
    | n0 < 0    = streamError "index" "Negative index"
    | otherwise = go n0 s0
  where
    -- Count down from the requested index as elements are produced.
    go !n !s =
      case step s of
        Done    -> streamError "index" "Index too large"
        Skip s' -> go n s'
        Yield c s'
          | n == 0    -> c
          | otherwise -> go (n-1) s'
{-# INLINE [0] indexI #-}
-- | /O(n)/ 'filter', applied to a predicate and a stream,
-- returns a stream containing those characters that satisfy the
-- predicate.
filter :: (Char -> Bool) -> Stream Char -> Stream Char
filter p (Stream step0 st0 len) =
    Stream step st0 (len - unknownSize) -- HINT maybe too high
  where
    step !st =
      case step0 st of
        Done     -> Done
        Skip st' -> Skip st'
        Yield c st'
          | p c       -> Yield c st'
          | otherwise -> Skip st'
{-# INLINE [0] filter #-}
{-# RULES
  "STREAM filter/filter fusion" forall p q s.
  filter p (filter q s) = filter (\x -> q x && p x) s
 #-}
-- | The 'findIndexI' function takes a predicate and a stream and
-- returns the index of the first element in the stream satisfying the
-- predicate.
findIndexI :: Integral a => (Char -> Bool) -> Stream Char -> Maybe a
findIndexI p s =
    case findIndicesI p s of
      []      -> Nothing
      (i : _) -> Just i
{-# INLINE [0] findIndexI #-}
-- | The 'findIndicesI' function takes a predicate and a stream and
-- returns all indices of the elements in the stream satisfying the
-- predicate.
findIndicesI :: Integral a => (Char -> Bool) -> Stream Char -> [a]
findIndicesI p (Stream step s0 _len) = go 0 s0
  where
    go !i !s =
      case step s of
        Done    -> []
        Skip s' -> go i s'          -- a Skip produces no element: index unchanged
        Yield c s'
          | p c       -> i : go (i+1) s'
          | otherwise -> go (i+1) s'
{-# INLINE [0] findIndicesI #-}
-------------------------------------------------------------------------------
-- * Zipping
-- | Zipping state: either the two stream states ('Z1'), or both
-- states plus an element already pulled from the first stream ('Z2').
data Zip a b m = Z1 !a !b
               | Z2 !a !b !m
-- | zipWith generalises 'zip' by zipping with the function given as
-- the first argument, instead of a tupling function.
zipWith :: (a -> a -> b) -> Stream a -> Stream a -> Stream b
zipWith f (Stream stepA sa0 lenA) (Stream stepB sb0 lenB) =
    Stream step (Z1 sa0 sb0) (smaller lenA lenB)
  where
    -- First pull an element from the left stream, buffering it in Z2 ...
    step (Z1 sa sb) =
      case stepA sa of
        Done        -> Done
        Skip sa'    -> Skip (Z1 sa' sb)
        Yield a sa' -> Skip (Z2 sa' sb a)
    -- ... then pair the buffered element with one from the right stream.
    step (Z2 sa sb a) =
      case stepB sb of
        Done        -> Done
        Skip sb'    -> Skip (Z2 sa sb' a)
        Yield b sb' -> Yield (f a b) (Z1 sa sb')
{-# INLINE [0] zipWith #-}
-- | /O(n)/ The 'countCharI' function returns the number of times the
-- query element appears in the given stream.
countCharI :: Integral a => Char -> Stream Char -> a
countCharI target (Stream step s0 _len) = go 0 s0
  where
    go !acc !s =
      case step s of
        Done    -> acc
        Skip s' -> go acc s'
        Yield c s'
          | c == target -> go (acc+1) s'
          | otherwise   -> go acc s'
{-# INLINE [0] countCharI #-}
-- | Raise an error tagged with this module's name and the name of the
-- offending function.
streamError :: String -> String -> a
streamError func msg =
    P.error ("Data.Text.Internal.Fusion.Common." ++ func ++ ": " ++ msg)
-- | Error raised by functions that cannot accept an empty stream.
emptyError :: String -> a
emptyError func = internalError func "Empty input"
-- | Error indicating a bug in this library rather than in its caller.
internalError :: String -> a
internalError func = streamError func "Internal error"

View File

@ -1,187 +0,0 @@
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fno-warn-missing-methods #-}
-- |
-- Module : Data.Text.Internal.Fusion.Internal
-- Copyright : (c) Roman Leshchinskiy 2008,
-- (c) Bryan O'Sullivan 2009
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Size hints.
module Data.Text.Internal.Fusion.Size
(
Size
-- * Sizes
, exactSize
, maxSize
, betweenSize
, unknownSize
, unionSize
, charSize
, codePointsSize
-- * Querying sizes
, exactly
, smaller
, larger
, upperBound
, lowerBound
, compareSize
, isEmpty
) where
import Data.Char (ord)
import Data.Text.Internal (mul)
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
-- | A size hint, measured in UTF-16 code units.
data Size
    = Between {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ Lower and upper bounds on size.
    | Unknown                                         -- ^ Unknown size.
    deriving (Eq, Show)
-- | Return the exact size, but only when the lower and upper bounds
-- coincide.
exactly :: Size -> Maybe Int
exactly (Between lo hi)
    | lo == hi = Just lo
exactly _ = Nothing
{-# INLINE exactly #-}
-- | The 'Size' of the given code point: one UTF-16 code unit for a
-- BMP character, two (a surrogate pair) otherwise.
charSize :: Char -> Size
charSize c = exactSize (if ord c < 0x10000 then 1 else 2)
-- | The 'Size' of @n@ code points.  Each code point occupies between
-- one and two UTF-16 code units, hence the bounds.
codePointsSize :: Int -> Size
codePointsSize n =
#if defined(ASSERTS)
    assert (n >= 0)
#endif
    Between n (2*n)
{-# INLINE codePointsSize #-}
-- | A size known exactly: both bounds are @n@.
exactSize :: Int -> Size
exactSize n =
#if defined(ASSERTS)
    assert (n >= 0)
#endif
    Between n n
{-# INLINE exactSize #-}
-- | A size known only to be at most @n@.
maxSize :: Int -> Size
maxSize n =
#if defined(ASSERTS)
    assert (n >= 0)
#endif
    Between 0 n
{-# INLINE maxSize #-}
-- | A size known to lie between @m@ and @n@ code units, inclusive.
betweenSize :: Int -> Int -> Size
betweenSize m n =
#if defined(ASSERTS)
    -- NOTE: these two 'assert's chain by application: the second
    -- 'assert' is an argument to the first, which returns it after
    -- checking @m >= 0@, so both conditions are in fact checked.
    assert (m >= 0)
    assert (n >= m)
#endif
    Between m n
{-# INLINE betweenSize #-}
-- | The smallest hint that covers both arguments.
unionSize :: Size -> Size -> Size
unionSize x y =
  case (x, y) of
    (Between a b, Between c d) -> Between (min a c) (max b d)
    _                          -> Unknown
-- | A wholly unknown size.
unknownSize :: Size
unknownSize = Unknown
{-# INLINE unknownSize #-}
-- 'Size' supports checked arithmetic directly.  'abs' and 'signum'
-- are deliberately left undefined; see the -fno-warn-missing-methods
-- pragma at the top of this module.
instance Num Size where
    (+) = addSize
    (-) = subtractSize
    (*) = mulSize
    fromInteger = f where f = exactSize . fromInteger
                          {-# INLINE f #-}
-- | 'Int' addition that calls 'overflowError' instead of wrapping.
-- The sign check only detects overflow for non-negative operands,
-- which all 'Size' bounds are.
add :: Int -> Int -> Int
add m n | mn >= 0 = mn
        | otherwise = overflowError
 where mn = m + n
{-# INLINE add #-}
-- | Add two size hints, checking each bound for overflow.
addSize :: Size -> Size -> Size
addSize (Between ma mb) (Between na nb) = Between (add ma na) (add mb nb)
addSize _ _ = Unknown
{-# INLINE addSize #-}
-- | Subtract one size hint from another; both resulting bounds
-- saturate at zero.
subtractSize :: Size -> Size -> Size
subtractSize (Between ma mb) (Between na nb) = Between (max (ma-nb) 0) (max (mb-na) 0)
-- Subtracting an unknown size cannot raise the upper bound, and may
-- lower the lower bound all the way to zero.
subtractSize a@(Between 0 _) Unknown = a
subtractSize (Between _ mb) Unknown = Between 0 mb
subtractSize _ _ = Unknown
{-# INLINE subtractSize #-}
-- | Multiply two size hints; 'mul' checks each bound for overflow.
mulSize :: Size -> Size -> Size
mulSize (Between ma mb) (Between na nb) = Between (mul ma na) (mul mb nb)
mulSize _ _ = Unknown
{-# INLINE mulSize #-}
-- | Minimum of two size hints.
smaller :: Size -> Size -> Size
smaller a@(Between ma mb) b@(Between na nb)
    -- When the two ranges do not overlap, the answer is one of the inputs.
    | mb <= na  = a
    | nb <= ma  = b
    | otherwise = Between (ma `min` na) (mb `min` nb)
-- An unknown size can be as small as 0, so the lower bound drops to 0.
smaller a@(Between 0 _) Unknown = a
smaller (Between _ mb) Unknown  = Between 0 mb
smaller Unknown b@(Between 0 _) = b
smaller Unknown (Between _ nb)  = Between 0 nb
smaller Unknown Unknown         = Unknown
{-# INLINE smaller #-}
-- | Maximum of two size hints.
larger :: Size -> Size -> Size
larger a@(Between ma mb) b@(Between na nb)
    -- When the two ranges do not overlap, the answer is one of the inputs.
    | ma >= nb  = a
    | na >= mb  = b
    | otherwise = Between (ma `max` na) (mb `max` nb)
-- An unknown size has no upper bound, so the maximum is unknown too.
larger _ _ = Unknown
{-# INLINE larger #-}
-- | Compute the maximum size from a size hint, if possible; the first
-- argument is returned when the hint gives no upper bound.
upperBound :: Int -> Size -> Int
upperBound _ (Between _ n) = n
upperBound k _             = k
{-# INLINE upperBound #-}
-- | Compute the minimum size from a size hint, if possible; the first
-- argument is returned when the hint gives no lower bound.
-- (The previous haddock incorrectly said \"maximum\", copied from
-- 'upperBound'.)
lowerBound :: Int -> Size -> Int
lowerBound _ (Between n _) = n
lowerBound k _             = k
{-# INLINE lowerBound #-}
-- | Determine the ordering relationship between two 'Size's, or 'Nothing' in
-- the indeterminate case.
compareSize :: Size -> Size -> Maybe Ordering
compareSize (Between ma mb) (Between na nb)
  | mb < na   = Just LT
  | ma > nb   = Just GT
  -- Only exact and equal sizes compare EQ; overlapping inexact ranges
  -- fall through to the indeterminate case below.
  | ma == mb
  , ma == na
  , ma == nb  = Just EQ
compareSize _ _ = Nothing
-- | 'True' when the hint guarantees an empty stream.
isEmpty :: Size -> Bool
isEmpty (Between _ n) = n <= 0
isEmpty _             = False
{-# INLINE isEmpty #-}
-- | Raised when size arithmetic would overflow an 'Int'.
overflowError :: Int
overflowError = error "Data.Text.Internal.Fusion.Size: size overflow"

View File

@ -1,122 +0,0 @@
{-# LANGUAGE BangPatterns, ExistentialQuantification #-}
-- |
-- Module : Data.Text.Internal.Fusion.Types
-- Copyright : (c) Tom Harper 2008-2009,
-- (c) Bryan O'Sullivan 2009,
-- (c) Duncan Coutts 2009,
-- (c) Jasper Van der Jeugt 2011
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Core stream fusion functionality for text.
module Data.Text.Internal.Fusion.Types
(
CC(..)
, PairS(..)
, Scan(..)
, RS(..)
, Step(..)
, Stream(..)
, empty
) where
import Data.Text.Internal.Fusion.Size
import Data.Word (Word8)
-- | Specialised tuple for case conversion: the stream state plus two
-- characters still to be emitted.
data CC s = CC !s {-# UNPACK #-} !Char {-# UNPACK #-} !Char
-- | Restreaming state: the underlying state plus zero to three bytes
-- that have been produced but not yet consumed.
data RS s
    = RS0 !s
    | RS1 !s {-# UNPACK #-} !Word8
    | RS2 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
    | RS3 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
-- | Strict pair.
data PairS a b = !a :*: !b
                 -- deriving (Eq, Ord, Show)
infixl 2 :*:
-- | An intermediate result in a scan.
data Scan s = Scan1 {-# UNPACK #-} !Char !s
            | Scan2 {-# UNPACK #-} !Char !s
-- | Intermediate result in a processing pipeline: end of stream, a
-- state transition with no output, or one output element plus the
-- next state.
data Step s a = Done
              | Skip !s
              | Yield !a !s
{-
instance (Show a) => Show (Step s a)
    where show Done          = "Done"
          show (Skip _)      = "Skip"
          show (Yield x _)   = "Yield " ++ show x
-}
instance (Eq a) => Eq (Stream a) where
    (==) = eq
instance (Ord a) => Ord (Stream a) where
    compare = cmp
-- The length hint in a Stream has two roles. If its value is zero,
-- we trust it, and treat the stream as empty. Otherwise, we treat it
-- as a hint: it should usually be accurate, so we use it when
-- unstreaming to decide what size array to allocate. However, the
-- unstreaming functions must be able to cope with the hint being too
-- small or too large.
--
-- The size hint tries to track the UTF-16 code units in a stream,
-- but often counts the number of code points instead. It can easily
-- undercount if, for instance, a transformed stream contains astral
-- plane code points (those above 0x10000).
-- | A stream: an existentially quantified stepper function, its
-- current state, and a size hint.
data Stream a =
    forall s. Stream
    (s -> Step s a)             -- stepper function
    !s                          -- current state
    !Size                       -- size hint in code units
-- | /O(n)/ Determines if two streams are equal.
eq :: (Eq a) => Stream a -> Stream a -> Bool
eq (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2)
    where
      -- 'Skip's on either side are stepped past before elements are
      -- compared; equal streams must also terminate together.  The
      -- equation order matters: Done/Skip pairs must reach the Skip
      -- cases before the catch-all Done cases below.
      loop Done Done                     = True
      loop (Skip s1')     (Skip s2')     = loop (next1 s1') (next2 s2')
      loop (Skip s1')     x2             = loop (next1 s1') x2
      loop x1             (Skip s2')     = loop x1 (next2 s2')
      loop Done _                        = False
      loop _    Done                     = False
      loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 &&
                                           loop (next1 s1') (next2 s2')
{-# INLINE [0] eq #-}
-- | /O(n)/ Lexicographic comparison of two streams: a stream that is
-- a proper prefix of another compares as smaller.
cmp :: (Ord a) => Stream a -> Stream a -> Ordering
cmp (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2)
    where
      loop Done Done                     = EQ
      loop (Skip s1')     (Skip s2')     = loop (next1 s1') (next2 s2')
      loop (Skip s1')     x2             = loop (next1 s1') x2
      loop x1             (Skip s2')     = loop x1 (next2 s2')
      -- The shorter stream is the smaller one.
      loop Done _                        = LT
      loop _    Done                     = GT
      loop (Yield x1 s1') (Yield x2 s2') =
          case compare x1 x2 of
            EQ    -> loop (next1 s1') (next2 s2')
            other -> other
{-# INLINE [0] cmp #-}
-- | The empty stream: its stepper never produces anything and its
-- size hint is exactly zero.
empty :: Stream a
empty = Stream (\_ -> Done) () 0
{-# INLINE [0] empty #-}

View File

@ -1,166 +0,0 @@
{-# LANGUAGE BangPatterns, RecordWildCards #-}
-- |
-- Module : Data.Text.Internal.IO
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Simon Marlow
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Low-level support for text I\/O.
module Data.Text.Internal.IO
(
hGetLineWith
, readChunk
) where
import qualified Control.Exception as E
import Data.IORef (readIORef, writeIORef)
import Data.Text (Text)
import Data.Text.Internal.Fusion (unstream)
import Data.Text.Internal.Fusion.Types (Step(..), Stream(..))
import Data.Text.Internal.Fusion.Size (exactSize, maxSize)
import Data.Text.Unsafe (inlinePerformIO)
import Foreign.Storable (peekElemOff)
import GHC.IO.Buffer (Buffer(..), CharBuffer, RawCharBuffer, bufferAdjustL,
bufferElems, charSize, isEmptyBuffer, readCharBuf,
withRawBuffer, writeCharBuf)
import GHC.IO.Handle.Internals (ioe_EOF, readTextDevice, wantReadableHandle_)
import GHC.IO.Handle.Types (Handle__(..), Newline(..))
import System.IO (Handle)
import System.IO.Error (isEOFError)
import qualified Data.Text as T
-- | Read a single line of input from a handle, constructing a list of
-- decoded chunks as we go. When we're done, transform them into the
-- destination type.
hGetLineWith :: ([Text] -> t) -> Handle -> IO t
hGetLineWith f h = wantReadableHandle_ "hGetLine" h go
  where
    -- Start from the handle's current character buffer and accumulate
    -- decoded chunks in reverse.
    go hh@Handle__{..} = readIORef haCharBuffer >>= fmap f . hGetLineLoop hh []
-- | Accumulate decoded chunks (in reverse order) until a newline or
-- EOF is reached, refilling the handle's buffer as needed.
hGetLineLoop :: Handle__ -> [Text] -> CharBuffer -> IO [Text]
hGetLineLoop hh@Handle__{..} = go where
 go ts buf@Buffer{ bufL=r0, bufR=w, bufRaw=raw0 } = do
   -- Scan the buffered characters for a '\n'.
   let findEOL raw r | r == w    = return (False, w)
                     | otherwise = do
        (c,r') <- readCharBuf raw r
        if c == '\n'
          then return (True, r)
          else findEOL raw r'
   (eol, off) <- findEOL raw0 r0
   -- Decode the scanned span, translating CRLF if the handle asks for it.
   (t,r') <- if haInputNL == CRLF
             then unpack_nl raw0 r0 off
             else do t <- unpack raw0 r0 off
                     return (t,off)
   if eol
     then do writeIORef haCharBuffer (bufferAdjustL (off+1) buf)
             return $ reverse (t:ts)
     else do
       let buf1 = bufferAdjustL r' buf
       maybe_buf <- maybeFillReadBuffer hh buf1
       case maybe_buf of
         -- Nothing indicates we caught an EOF, and we may have a
         -- partial line to return.
         Nothing -> do
           -- we reached EOF. There might be a lone \r left
           -- in the buffer, so check for that and
           -- append it to the line if necessary.
           let pre | isEmptyBuffer buf1 = T.empty
                   | otherwise          = T.singleton '\r'
           writeIORef haCharBuffer buf1{ bufL=0, bufR=0 }
           let str = reverse . filter (not . T.null) $ pre:t:ts
           if null str
             then ioe_EOF
             else return str
         Just new_buf -> go (t:ts) new_buf
-- This function is lifted almost verbatim from GHC.IO.Handle.Text.
-- It turns an EOF exception into 'Nothing' and rethrows anything else.
maybeFillReadBuffer :: Handle__ -> CharBuffer -> IO (Maybe CharBuffer)
maybeFillReadBuffer handle_ buf
  = E.catch (Just `fmap` getSomeCharacters handle_ buf) $ \e ->
      if isEOFError e
      then return Nothing
      else ioError e
-- | Decode the characters in @buf@ between indices @r@ (inclusive)
-- and @w@ (exclusive) with no newline translation.
unpack :: RawCharBuffer -> Int -> Int -> IO Text
unpack !buf !r !w
 | charSize /= 4 = sizeError "unpack"  -- this module assumes 4-byte chars
 | r >= w        = return T.empty
 | otherwise     = withRawBuffer buf go
 where
   go pbuf = return $! unstream (Stream next r (exactSize (w-r)))
     where
       next !i | i >= w    = Done
               | otherwise = Yield (ix i) (i+1)
       ix i = inlinePerformIO $ peekElemOff pbuf i
-- | Decode the characters in @buf@ between indices @r@ and @w@,
-- translating CR and CRLF into LF.  Also returns the index one past
-- the last character consumed: a trailing lone @'\r'@ is held back,
-- in case its matching @'\n'@ arrives in the next read.
unpack_nl :: RawCharBuffer -> Int -> Int -> IO (Text, Int)
unpack_nl !buf !r !w
 | charSize /= 4 = sizeError "unpack_nl"
 | r >= w        = return (T.empty, 0)
 | otherwise     = withRawBuffer buf $ go
 where
   go pbuf = do
     let !t = unstream (Stream next r (maxSize (w-r)))
         w' = w - 1
     -- Hold back a final '\r': it may be the start of a CRLF pair.
     return $ if ix w' == '\r'
              then (t,w')
              else (t,w)
     where
       next !i | i >= w = Done
               -- Collapse '\r' or "\r\n" into a single '\n'.
               | c == '\r' = let i' = i + 1
                             in if i' < w
                                then if ix i' == '\n'
                                     then Yield '\n' (i+2)
                                     else Yield '\n' i'
                                else Done
               | otherwise = Yield c (i+1)
          where c = ix i
       ix i = inlinePerformIO $ peekElemOff pbuf i
-- This function is completely lifted from GHC.IO.Handle.Text.
-- Ensure there is at least one usable character in the buffer,
-- reading from the underlying device when necessary.
getSomeCharacters :: Handle__ -> CharBuffer -> IO CharBuffer
getSomeCharacters handle_@Handle__{..} buf@Buffer{..} =
  case bufferElems buf of
    -- buffer empty: read some more
    0 -> {-# SCC "readTextDevice" #-} readTextDevice handle_ buf
    -- if the buffer has a single '\r' in it and we're doing newline
    -- translation: read some more
    1 | haInputNL == CRLF -> do
      (c,_) <- readCharBuf bufRaw bufL
      if c == '\r'
         then do -- shuffle the '\r' to the beginning. This is only safe
                 -- if we're about to call readTextDevice, otherwise it
                 -- would mess up flushCharBuffer.
                 -- See [note Buffer Flushing], GHC.IO.Handle.Types
                 _ <- writeCharBuf bufRaw 0 '\r'
                 let buf' = buf{ bufL=0, bufR=1 }
                 readTextDevice handle_ buf'
         else do
                 return buf
    -- buffer has some chars in it already: just return it
    _otherwise -> {-# SCC "otherwise" #-} return buf
-- | Read a single chunk of strict text from a buffer. Used by both
-- the strict and lazy implementations of hGetContents.
readChunk :: Handle__ -> CharBuffer -> IO Text
readChunk hh@Handle__{..} buf = do
  buf'@Buffer{..} <- getSomeCharacters hh buf
  -- Decode everything currently buffered, translating CRLF if the
  -- handle requires it; 'r' is how far we actually consumed.
  (t,r) <- if haInputNL == CRLF
           then unpack_nl bufRaw bufL bufR
           else do t <- unpack bufRaw bufL bufR
                   return (t,bufR)
  writeIORef haCharBuffer (bufferAdjustL r buf')
  return t
-- | Raised when 'charSize' is not the 4 bytes this module assumes.
sizeError :: String -> a
sizeError loc = error $ "Data.Text.IO." ++ loc ++ ": bad internal buffer size"

View File

@ -1,119 +0,0 @@
{-# LANGUAGE BangPatterns, DeriveDataTypeable #-}
{-# OPTIONS_HADDOCK not-home #-}
-- |
-- Module : Data.Text.Internal.Lazy
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- A module containing private 'Text' internals. This exposes the
-- 'Text' representation and low level construction functions.
-- Modules which extend the 'Text' system may need to use this module.
module Data.Text.Internal.Lazy
(
Text(..)
, chunk
, empty
, foldrChunks
, foldlChunks
-- * Data type invariant and abstraction functions
-- $invariant
, strictInvariant
, lazyInvariant
, showStructure
-- * Chunk allocation sizes
, defaultChunkSize
, smallChunkSize
, chunkOverhead
) where
import Data.Text ()
import Data.Text.Internal.Unsafe.Shift (shiftL)
import Data.Typeable (Typeable)
import Foreign.Storable (sizeOf)
import qualified Data.Text.Internal as T
-- | A lazy 'Text': a (possibly empty) sequence of non-empty strict
-- chunks.  See the invariant below.
data Text = Empty
          | Chunk {-# UNPACK #-} !T.Text Text
            deriving (Typeable)
-- $invariant
--
-- The data type invariant for lazy 'Text': Every 'Text' is either 'Empty' or
-- consists of non-null 'T.Text's. All functions must preserve this,
-- and the QC properties must check this.
-- | Check the invariant strictly: walks the whole spine, erroring on
-- the first empty chunk.
strictInvariant :: Text -> Bool
strictInvariant Empty = True
strictInvariant x@(Chunk (T.Text _ _ len) cs)
    | len > 0   = strictInvariant cs
    | otherwise = error $ "Data.Text.Lazy: invariant violation: "
                  ++ showStructure x
-- | Check the invariant lazily: rebuilds the argument chunk by chunk,
-- so the check interleaves with consumption.
lazyInvariant :: Text -> Text
lazyInvariant Empty = Empty
lazyInvariant x@(Chunk c@(T.Text _ _ len) cs)
    | len > 0   = Chunk c (lazyInvariant cs)
    | otherwise = error $ "Data.Text.Lazy: invariant violation: "
                  ++ showStructure x
-- | Display the internal structure of a lazy 'Text'.
showStructure :: Text -> String
showStructure t0 =
  case t0 of
    Empty         -> "Empty"
    Chunk t Empty -> "Chunk " ++ show t ++ " Empty"
    Chunk t ts    -> "Chunk " ++ show t ++ " (" ++ showStructure ts ++ ")"
-- | Smart constructor for 'Chunk'. Guarantees the data type invariant
-- by dropping an empty chunk instead of prepending it.
chunk :: T.Text -> Text -> Text
{-# INLINE chunk #-}
chunk t@(T.Text _ _ len) ts | len == 0 = ts
                            | otherwise = Chunk t ts
-- | Smart constructor for 'Empty'.
empty :: Text
{-# INLINE [0] empty #-}
empty = Empty
-- | Consume the chunks of a lazy 'Text' with a natural right fold.
foldrChunks :: (T.Text -> a -> a) -> a -> Text -> a
foldrChunks f z = walk
  where walk Empty        = z
        walk (Chunk c cs) = f c (walk cs)
{-# INLINE foldrChunks #-}
-- | Consume the chunks of a lazy 'Text' with a strict, tail-recursive,
-- accumulating left fold.
foldlChunks :: (a -> T.Text -> a) -> a -> Text -> a
foldlChunks f = walk
  where walk !acc Empty        = acc
        walk !acc (Chunk c cs) = walk (f acc c) cs
{-# INLINE foldlChunks #-}
-- | Currently set to 16 KiB, less the memory management overhead.
defaultChunkSize :: Int
defaultChunkSize = 16384 - chunkOverhead
{-# INLINE defaultChunkSize #-}
-- | Currently set to 128 bytes, less the memory management overhead.
smallChunkSize :: Int
smallChunkSize = 128 - chunkOverhead
{-# INLINE smallChunkSize #-}
-- | The memory management overhead. Currently this is tuned for GHC only.
-- Two machine words, i.e. @2 * sizeOf (undefined :: Int)@ bytes.
chunkOverhead :: Int
chunkOverhead = sizeOf (undefined :: Int) `shiftL` 1
{-# INLINE chunkOverhead #-}

View File

@ -1,324 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
-- |
-- Module : Data.Text.Lazy.Encoding.Fusion
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fusible 'Stream'-oriented functions for converting between lazy
-- 'Text' and several common encodings.
module Data.Text.Internal.Lazy.Encoding.Fusion
(
-- * Streaming
-- streamASCII
streamUtf8
, streamUtf16LE
, streamUtf16BE
, streamUtf32LE
, streamUtf32BE
-- * Unstreaming
, unstream
, module Data.Text.Internal.Encoding.Fusion.Common
) where
import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
import qualified Data.ByteString as B
import qualified Data.ByteString.Unsafe as B
import Data.Text.Internal.Encoding.Fusion.Common
import Data.Text.Encoding.Error
import Data.Text.Internal.Fusion (Step(..), Stream(..))
import Data.Text.Internal.Fusion.Size
import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32)
import Data.Text.Internal.Unsafe.Shift (shiftL)
import Data.Word (Word8, Word16, Word32)
import qualified Data.Text.Internal.Encoding.Utf8 as U8
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import qualified Data.Text.Internal.Encoding.Utf32 as U32
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
import Foreign.Storable (pokeByteOff)
import Data.ByteString.Internal (mallocByteString, memcpy)
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import qualified Data.ByteString.Internal as B
-- | Decoder state: zero to four bytes that have been read from the
-- input but not yet decoded into a character.
data S = S0
       | S1 {-# UNPACK #-} !Word8
       | S2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
       | S3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
       | S4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
-- | Overall decoder state: remaining input, pending bytes, and the
-- offset into the current chunk.
data T = T !ByteString !S {-# UNPACK #-} !Int
-- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
-- UTF-8 encoding.
streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
streamUtf8 onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  where
    -- Fast path: no pending bytes, decode directly out of the chunk.
    next (T bs@(Chunk ps _) S0 i)
      | i < len && U8.validate1 a =
          Yield (unsafeChr8 a) (T bs S0 (i+1))
      | i + 1 < len && U8.validate2 a b =
          Yield (U8.chr2 a b) (T bs S0 (i+2))
      | i + 2 < len && U8.validate3 a b c =
          Yield (U8.chr3 a b c) (T bs S0 (i+3))
      | i + 3 < len && U8.validate4 a b c d =
          Yield (U8.chr4 a b c d) (T bs S0 (i+4))
      where len = B.length ps
            a = B.unsafeIndex ps i
            b = B.unsafeIndex ps (i+1)
            c = B.unsafeIndex ps (i+2)
            d = B.unsafeIndex ps (i+3)
    -- Slow path: try to decode a character from the pending bytes.
    next st@(T bs s i) =
      case s of
        S1 a | U8.validate1 a -> Yield (unsafeChr8 a) es
        S2 a b | U8.validate2 a b -> Yield (U8.chr2 a b) es
        S3 a b c | U8.validate3 a b c -> Yield (U8.chr3 a b c) es
        S4 a b c d | U8.validate4 a b c d -> Yield (U8.chr4 a b c d) es
        _ -> consume st
      where es = T bs S0 i
    -- Pull one more byte into the pending state, crossing chunk
    -- boundaries as needed; four invalid pending bytes are an error.
    consume (T bs@(Chunk ps rest) s i)
        | i >= B.length ps = consume (T rest s 0)
        | otherwise =
      case s of
        S0 -> next (T bs (S1 x) (i+1))
        S1 a -> next (T bs (S2 a x) (i+1))
        S2 a b -> next (T bs (S3 a b x) (i+1))
        S3 a b c -> next (T bs (S4 a b c x) (i+1))
        S4 a b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
                      (T bs (S3 b c d) (i+1))
      where x = B.unsafeIndex ps i
    consume (T Empty S0 _) = Done
    consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
{-# INLINE [0] streamUtf8 #-}
-- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using little
-- endian UTF-16 encoding.
streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  where
    -- Fast path: no pending bytes, decode directly out of the chunk.
    next (T bs@(Chunk ps _) S0 i)
      | i + 1 < len && U16.validate1 x1 =
          Yield (unsafeChr x1) (T bs S0 (i+2))
      | i + 3 < len && U16.validate2 x1 x2 =
          Yield (U16.chr2 x1 x2) (T bs S0 (i+4))
      where len = B.length ps
            x1 = c (idx i) (idx (i + 1))
            x2 = c (idx (i + 2)) (idx (i + 3))
            -- little endian: low byte first
            c w1 w2 = w1 + (w2 `shiftL` 8)
            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
    -- Slow path: try to decode from two or four pending bytes.
    next st@(T bs s i) =
      case s of
        S2 w1 w2 | U16.validate1 (c w1 w2) ->
          Yield (unsafeChr (c w1 w2)) es
        S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
          Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
        _ -> consume st
      where es = T bs S0 i
            c :: Word8 -> Word8 -> Word16
            c w1 w2 = fromIntegral w1 + (fromIntegral w2 `shiftL` 8)
    -- Pull one more byte into the pending state, crossing chunk
    -- boundaries as needed.
    consume (T bs@(Chunk ps rest) s i)
        | i >= B.length ps = consume (T rest s 0)
        | otherwise =
      case s of
        S0 -> next (T bs (S1 x) (i+1))
        S1 w1 -> next (T bs (S2 w1 x) (i+1))
        S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
        S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
        S4 w1 w2 w3 w4 -> decodeError "streamUtf16LE" "UTF-16LE" onErr (Just w1)
                          (T bs (S3 w2 w3 w4) (i+1))
      where x = B.unsafeIndex ps i
    consume (T Empty S0 _) = Done
    consume st = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing st
{-# INLINE [0] streamUtf16LE #-}
-- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using big
-- endian UTF-16 encoding.
streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  where
    -- Fast path: no pending bytes, decode directly out of the chunk.
    next (T bs@(Chunk ps _) S0 i)
      | i + 1 < len && U16.validate1 x1 =
          Yield (unsafeChr x1) (T bs S0 (i+2))
      | i + 3 < len && U16.validate2 x1 x2 =
          Yield (U16.chr2 x1 x2) (T bs S0 (i+4))
      where len = B.length ps
            x1 = c (idx i) (idx (i + 1))
            x2 = c (idx (i + 2)) (idx (i + 3))
            -- big endian: high byte first
            c w1 w2 = (w1 `shiftL` 8) + w2
            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
    -- Slow path: try to decode from two or four pending bytes.
    next st@(T bs s i) =
      case s of
        S2 w1 w2 | U16.validate1 (c w1 w2) ->
          Yield (unsafeChr (c w1 w2)) es
        S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
          Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
        _ -> consume st
      where es = T bs S0 i
            c :: Word8 -> Word8 -> Word16
            c w1 w2 = (fromIntegral w1 `shiftL` 8) + fromIntegral w2
    -- Pull one more byte into the pending state, crossing chunk
    -- boundaries as needed.
    consume (T bs@(Chunk ps rest) s i)
        | i >= B.length ps = consume (T rest s 0)
        | otherwise =
      case s of
        S0 -> next (T bs (S1 x) (i+1))
        S1 w1 -> next (T bs (S2 w1 x) (i+1))
        S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
        S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
        S4 w1 w2 w3 w4 -> decodeError "streamUtf16BE" "UTF-16BE" onErr (Just w1)
                          (T bs (S3 w2 w3 w4) (i+1))
      where x = B.unsafeIndex ps i
    consume (T Empty S0 _) = Done
    consume st = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing st
{-# INLINE [0] streamUtf16BE #-}
-- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using big
-- endian UTF-32 encoding.
streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
streamUtf32BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  where
    -- Fast path: four bytes available in the current chunk.
    next (T bs@(Chunk ps _) S0 i)
      | i + 3 < len && U32.validate x =
          Yield (unsafeChr32 x) (T bs S0 (i+4))
      where len = B.length ps
            -- big endian: most significant byte first
            x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
            x1 = idx i
            x2 = idx (i+1)
            x3 = idx (i+2)
            x4 = idx (i+3)
            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
    -- Slow path: try to decode four pending bytes.
    next st@(T bs s i) =
      case s of
        S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
          Yield (unsafeChr32 (c w1 w2 w3 w4)) es
        _ -> consume st
      where es = T bs S0 i
            c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
            c w1 w2 w3 w4 = shifted
              where
                shifted = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
                x1 = fromIntegral w1
                x2 = fromIntegral w2
                x3 = fromIntegral w3
                x4 = fromIntegral w4
    -- Pull one more byte into the pending state, crossing chunk
    -- boundaries as needed.
    consume (T bs@(Chunk ps rest) s i)
        | i >= B.length ps = consume (T rest s 0)
        | otherwise =
      case s of
        S0 -> next (T bs (S1 x) (i+1))
        S1 w1 -> next (T bs (S2 w1 x) (i+1))
        S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
        S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
        S4 w1 w2 w3 w4 -> decodeError "streamUtf32BE" "UTF-32BE" onErr (Just w1)
                          (T bs (S3 w2 w3 w4) (i+1))
      where x = B.unsafeIndex ps i
    consume (T Empty S0 _) = Done
    consume st = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing st
{-# INLINE [0] streamUtf32BE #-}
-- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using little
-- endian UTF-32 encoding.
streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
streamUtf32LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
  where
    -- Fast path: four bytes available in the current chunk.
    next (T bs@(Chunk ps _) S0 i)
      | i + 3 < len && U32.validate x =
          Yield (unsafeChr32 x) (T bs S0 (i+4))
      where len = B.length ps
            -- little endian: least significant byte first
            x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
            x1 = idx i
            x2 = idx (i+1)
            x3 = idx (i+2)
            x4 = idx (i+3)
            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
    -- Slow path: try to decode four pending bytes.
    next st@(T bs s i) =
      case s of
        S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
          Yield (unsafeChr32 (c w1 w2 w3 w4)) es
        _ -> consume st
      where es = T bs S0 i
            c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
            c w1 w2 w3 w4 = shifted
              where
                shifted = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
                x1 = fromIntegral w1
                x2 = fromIntegral w2
                x3 = fromIntegral w3
                x4 = fromIntegral w4
    -- Pull one more byte into the pending state, crossing chunk
    -- boundaries as needed.
    consume (T bs@(Chunk ps rest) s i)
        | i >= B.length ps = consume (T rest s 0)
        | otherwise =
      case s of
        S0 -> next (T bs (S1 x) (i+1))
        S1 w1 -> next (T bs (S2 w1 x) (i+1))
        S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
        S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
        S4 w1 w2 w3 w4 -> decodeError "streamUtf32LE" "UTF-32LE" onErr (Just w1)
                          (T bs (S3 w2 w3 w4) (i+1))
      where x = B.unsafeIndex ps i
    consume (T Empty S0 _) = Done
    consume st = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing st
{-# INLINE [0] streamUtf32LE #-}
-- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString', built
-- from chunks of at most @chunkSize@ bytes.
unstreamChunks :: Int -> Stream Word8 -> ByteString
unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
  where chunk s1 len1 = unsafeDupablePerformIO $ do
          -- Allocate no more than one chunk up front; the buffer is
          -- grown on demand (see 'realloc') up to the chunk limit.
          let len = max 4 (min len1 chunkSize)
          mallocByteString len >>= loop len 0 s1
          where
            loop !n !off !s fp = case next s of
              Done | off == 0 -> return Empty
                   | otherwise -> return $! Chunk (trimUp fp off) Empty
              Skip s' -> loop n off s' fp
              Yield x s'
                -- Chunk full: emit it and restart from the pre-Yield
                -- state 's', so 'x' is re-produced in the next chunk.
                | off == chunkSize -> do
                    let !newLen = n - off
                    return $! Chunk (trimUp fp off) (chunk s newLen)
                | off == n -> realloc fp n off s' x
                | otherwise -> do
                    withForeignPtr fp $ \p -> pokeByteOff p off x
                    loop n (off+1) s' fp
            {-# NOINLINE realloc #-}
            -- Double the buffer (capped at the chunk size) and retry
            -- the write that did not fit.
            realloc fp n off s x = do
              let n' = min (n+n) chunkSize
              fp' <- copy0 fp n n'
              withForeignPtr fp' $ \p -> pokeByteOff p off x
              loop n' (off+1) s fp'
        trimUp fp off = B.PS fp 0 off
        copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
        copy0 !src !srcLen !destLen =
#if defined(ASSERTS)
          assert (srcLen <= destLen) $
#endif
          do
            dest <- mallocByteString destLen
            withForeignPtr src $ \src' ->
              withForeignPtr dest $ \dest' ->
                memcpy dest' src' (fromIntegral srcLen)
            return dest
-- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString', using
-- 'defaultChunkSize' chunks.
unstream :: Stream Word8 -> ByteString
unstream = unstreamChunks defaultChunkSize
-- | Handle a decoding error by consulting the caller-supplied
-- 'OnDecodeError' handler: skip the offending input, or yield the
-- handler's replacement character.
decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
            -> s -> Step s Char
decodeError func kind onErr mb i =
    maybe (Skip i) (`Yield` i) (onErr desc mb)
  where
    desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
           kind ++ " stream"

View File

@ -1,120 +0,0 @@
{-# LANGUAGE BangPatterns #-}
-- |
-- Module : Data.Text.Lazy.Fusion
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Core stream fusion functionality for text.
module Data.Text.Internal.Lazy.Fusion
(
stream
, unstream
, unstreamChunks
, length
, unfoldrN
, index
, countChar
) where
import Prelude hiding (length)
import qualified Data.Text.Internal.Fusion.Common as S
import Control.Monad.ST (runST)
import Data.Text.Internal.Fusion.Types
import Data.Text.Internal.Fusion.Size (isEmpty, unknownSize)
import Data.Text.Internal.Lazy
import qualified Data.Text.Internal as I
import qualified Data.Text.Array as A
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Internal.Unsafe.Shift (shiftL)
import Data.Text.Unsafe (Iter(..), iter)
import Data.Int (Int64)
default(Int64)
-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.
stream :: Text -> Stream Char
stream text = Stream next (text :*: 0) unknownSize
  where
    -- Walk the chunk list; the Int is the Word16 offset into the
    -- current chunk's array.
    next (Empty :*: _) = Done
    next (txt@(Chunk t@(I.Text _ _ len) ts) :*: i)
        | i >= len  = next (ts :*: 0)        -- current chunk exhausted
        | otherwise = Yield c (txt :*: i+d)  -- d = width of c in Word16 units
      where Iter c d = iter t i
{-# INLINE [0] stream #-}
-- | /O(n)/ Convert a 'Stream Char' into a 'Text', using the given
-- chunk size.
--
-- Each chunk starts as a 4-element array and is doubled (capped at
-- @chunkSize@) whenever it fills up; when a chunk reaches @chunkSize@
-- it is frozen and a new chunk is started lazily.
unstreamChunks :: Int -> Stream Char -> Text
unstreamChunks !chunkSize (Stream next s0 len0)
  | isEmpty len0 = Empty
  | otherwise    = outer s0
  where
    outer so = {-# SCC "unstreamChunks/outer" #-}
          case next so of
            Done       -> Empty
            Skip s'    -> outer s'
            Yield x s' -> runST $ do
                            a <- A.new unknownLength
                            unsafeWrite a 0 x >>= inner a unknownLength s'
                where unknownLength = 4
      where
        -- inner: fill the current mutable chunk; len is its capacity,
        -- i the write offset.  The "+ 1" slack leaves room for a
        -- surrogate pair, which occupies two Word16s.
        inner marr !len s !i
            | i + 1 >= chunkSize = finish marr i s
            | i + 1 >= len       = {-# SCC "unstreamChunks/resize" #-} do
                let newLen = min (len `shiftL` 1) chunkSize
                marr' <- A.new newLen
                A.copyM marr' 0 marr 0 len
                inner marr' newLen s i
            | otherwise =
                {-# SCC "unstreamChunks/inner" #-}
                case next s of
                  Done       -> finish marr i s
                  Skip s'    -> inner marr len s' i
                  Yield x s' -> do d <- unsafeWrite marr i x
                                   inner marr len s' (i+d)
        -- Freeze the chunk and continue lazily with the rest of the
        -- stream.
        finish marr len s' = do
            arr <- A.unsafeFreeze marr
            return (I.Text arr 0 len `Chunk` outer s')
{-# INLINE [0] unstreamChunks #-}
-- | /O(n)/ Convert a 'Stream Char' into a 'Text', using
-- 'defaultChunkSize'.
unstream :: Stream Char -> Text
unstream = unstreamChunks defaultChunkSize
{-# INLINE [0] unstream #-}
-- | /O(n)/ Returns the number of characters in a text.
--
-- Counts stream elements ('Char's), not 'Word16' code units.  The
-- result is 'Int64' because a lazy 'Text' may not fit in an 'Int'.
-- This shadows 'Prelude.length' (hidden by the module's import list).
length :: Stream Char -> Int64
length = S.lengthI
{-# INLINE[0] length #-}
{-# RULES "LAZY STREAM stream/unstream fusion" forall s.
stream (unstream s) = s #-}
-- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed
-- value. However, the length of the result is limited by the
-- first argument to 'unfoldrN'. This function is more efficient than
-- 'unfoldr' when the length of the result is known.
unfoldrN :: Int64 -> (a -> Maybe (Char,a)) -> a -> Stream Char
unfoldrN n = S.unfoldrNI n
{-# INLINE [0] unfoldrN #-}
-- | /O(n)/ stream index (subscript) operator, starting from 0.
--
-- NOTE(review): delegates to 'S.indexI'; out-of-range behaviour is
-- whatever that helper does — presumably an error — confirm at the
-- definition site.
index :: Stream Char -> Int64 -> Char
index = S.indexI
{-# INLINE [0] index #-}
-- | /O(n)/ The 'count' function returns the number of times the query
-- element appears in the given stream.
countChar :: Char -> Stream Char -> Int64
countChar = S.countCharI
{-# INLINE [0] countChar #-}

View File

@ -1,134 +0,0 @@
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
-- |
-- Module : Data.Text.Lazy.Search
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fast substring search for lazy 'Text', based on work by Boyer,
-- Moore, Horspool, Sunday, and Lundh. Adapted from the strict
-- implementation.
module Data.Text.Internal.Lazy.Search
(
indices
) where
import qualified Data.Text.Array as A
import Data.Int (Int64)
import Data.Word (Word16, Word64)
import qualified Data.Text.Internal as T
import Data.Text.Internal.Fusion.Types (PairS(..))
import Data.Text.Internal.Lazy (Text(..), foldlChunks)
import Data.Bits ((.|.), (.&.))
import Data.Text.Internal.Unsafe.Shift (shiftL)
-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- This function is strict in @needle@, and lazy (as far as possible)
-- in the chunks of @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text              -- ^ Substring to search for (@needle@)
        -> Text              -- ^ Text to search in (@haystack@)
        -> [Int64]
indices needle@(Chunk n ns) _haystack@(Chunk k ks)
    | nlen <= 0 = []
    | nlen == 1 = indicesOne (nindex 0) 0 k ks
    | otherwise = advance k ks 0 0
  where
    -- advance: scan one haystack chunk.  g is the global (Int64) offset
    -- of the scan position, i the offset within the current chunk.
    advance x@(T.Text _ _ l) xs = scan
     where
      scan !g !i
         | i >= m = case xs of
                      Empty      -> []
                      Chunk y ys -> advance y ys g (i-m)
         | lackingHay (i + nlen) x xs = []
         | c == z && candidateMatch 0 = g : scan (g+nlen) (i+nlen)
         | otherwise                  = scan (g+delta) (i+delta)
        where
          m = fromIntegral l
          c = hindex (i + nlast)
          -- Sunday-style shift: if the unit just past the window does
          -- not occur in the needle at all, jump the whole window.
          delta | nextInPattern = nlen + 1
                | c == z        = skip + 1
                | otherwise     = 1
          nextInPattern = mask .&. swizzle (hindex (i+nlen)) == 0
          candidateMatch !j
              | j >= nlast               = True
              | hindex (i+j) /= nindex j = False
              | otherwise                = candidateMatch (j+1)
          hindex = index x xs
    nlen   = wordLength needle
    nlast  = nlen - 1
    nindex = index n ns
    -- z: the needle's last Word16 (scanned across all its chunks).
    z = foldlChunks fin 0 needle
        where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)
    (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2)
    -- swizzle: 64-bit Bloom-style occurrence mask, one bit per low 6
    -- bits of a code unit.
    swizzle w = 1 `shiftL` (fromIntegral w .&. 0x3f)
    buildTable (T.Text xarr xoff xlen) xs = go
      where
        go !(g::Int64) !i !msk !skp
            | i >= xlast = case xs of
                             Empty      -> (msk .|. swizzle z) :*: skp
                             Chunk y ys -> buildTable y ys g 0 msk' skp'
            | otherwise  = go (g+1) (i+1) msk' skp'
            where c    = A.unsafeIndex xarr (xoff+i)
                  msk' = msk .|. swizzle c
                  skp' | c == z    = nlen - g - 2
                       | otherwise = skp
        xlast = xlen - 1
    -- | Check whether an attempt to index into the haystack at the
    -- given offset would fail.
    lackingHay q = go 0
      where
        go p (T.Text _ _ l) ps = p' < q && case ps of
                                             Empty      -> True
                                             Chunk r rs -> go p' r rs
            where p' = p + fromIntegral l
indices _ _ = []
-- | Fast index into a partly unpacked 'Text'. We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: T.Text -> Text -> Int64 -> Word16
index (T.Text arr off len) xs !i
    | j < len   = A.unsafeIndex arr (off+j)
    | otherwise = case xs of
                    Empty
                        -- out of bounds, but legal
                        | j == len  -> 0
                        -- should never happen, due to lackingHay above
                        | otherwise -> emptyError "index"
                    -- not in this chunk: recurse with a shifted index
                    Chunk c cs -> index c cs (i-fromIntegral len)
    where j = fromIntegral i
-- | A variant of 'indices' that scans linearly for a single 'Word16'.
--
-- The Int64 argument is the global offset of the current chunk; it is
-- added to each within-chunk hit position.
indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64]
indicesOne c = chunk
  where
    chunk !i (T.Text oarr ooff olen) os = go 0
      where
        go h | h >= olen = case os of
                             Empty      -> []
                             Chunk y ys -> chunk (i+fromIntegral olen) y ys
             | on == c   = i + fromIntegral h : go (h+1)
             | otherwise = go (h+1)
             where on = A.unsafeIndex oarr (ooff+h)
-- | The number of 'Word16' values in a 'Text', summed over all chunks.
wordLength :: Text -> Int64
wordLength = foldlChunks (\acc (T.Text _ _ l) -> acc + fromIntegral l) 0
-- | Report an internal invariant violation in this module.
emptyError :: String -> a
emptyError fun =
    error $ concat ["Data.Text.Lazy.Search.", fun, ": empty input"]

View File

@ -1,37 +0,0 @@
{-# LANGUAGE BangPatterns, Rank2Types, UnboxedTuples #-}
-- |
-- Module : Data.Text.Internal.Private
-- Copyright : (c) 2011 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
module Data.Text.Internal.Private
(
runText
, span_
) where
import Control.Monad.ST (ST, runST)
import Data.Text.Internal (Text(..), text)
import Data.Text.Unsafe (Iter(..), iter)
import qualified Data.Text.Array as A
-- | Split a 'Text' at the longest prefix whose characters all satisfy
-- the predicate.  Both halves share the original array (no copying);
-- the unboxed-tuple return avoids allocating a pair.
span_ :: (Char -> Bool) -> Text -> (# Text, Text #)
span_ p t@(Text arr off len) = (# hd,tl #)
  where hd = text arr off k
        tl = text arr (off+k) (len-k)
        -- k: Word16 length of the matching prefix (strict so both
        -- halves are computed from one traversal).
        !k = loop 0
        loop !i | i < len && p c = loop (i+d)
                | otherwise      = i
            where Iter c d = iter t i
{-# INLINE span_ #-}
-- | Run an 'ST' computation that fills a mutable array, handing it a
-- continuation that freezes the array into an immutable 'Text' of the
-- given length (offset 0).
runText :: (forall s. (A.MArray s -> Int -> ST s Text) -> ST s Text) -> Text
runText act = runST (act $ \ !marr !len -> do
                       arr <- A.unsafeFreeze marr
                       return $! text arr 0 len)
{-# INLINE runText #-}

View File

@ -1,69 +0,0 @@
-- |
-- Module : Data.Text.Internal.Read
-- Copyright : (c) 2014 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- Common internal functions for reading textual data.
module Data.Text.Internal.Read
(
IReader
, IParser(..)
, T(..)
, digitToInt
, hexDigitToInt
, perhaps
) where
import Control.Applicative as App (Applicative(..))
import Control.Arrow (first)
import Control.Monad (ap)
import Data.Char (ord)
-- | A reader over input @t@: either fail with a message, or produce a
-- result together with the remaining input.
type IReader t a = t -> Either String (a,t)

-- | A minimal parser newtype wrapping 'IReader'.
newtype IParser t a = P {
      runP :: IReader t a
    }
instance Functor (IParser t) where
    -- Map over the result, leaving the leftover input untouched.
    fmap f m = P $ fmap (first f) . runP m

instance Applicative (IParser t) where
    pure a = P $ \t -> Right (a,t)
    {-# INLINE pure #-}
    (<*>) = ap

instance Monad (IParser t) where
    return = App.pure
    -- Standard state-threading bind: stop at the first Left.
    m >>= k  = P $ \t -> case runP m t of
                           Left err     -> Left err
                           Right (a,t') -> runP (k a) t'
    {-# INLINE (>>=) #-}
-- If we ever need a `MonadFail` instance the definition below can be used
--
-- > instance MonadFail (IParser t) where
-- > fail msg = P $ \_ -> Left msg
--
-- But given the code compiles fine with a post-MFP GHC 8.6+ we don't need
-- one just yet.
-- | A strict pair of an 'Integer' and an 'Int'.
-- NOTE(review): field semantics (presumably mantissa/digit-count for
-- the numeric readers) are determined by callers — confirm there.
data T = T !Integer !Int
-- | Try a parser; on failure, succeed with the supplied default and
-- leave the input unconsumed.
perhaps :: a -> IParser t a -> IParser t a
perhaps def m = P $ \t -> either (const (Right (def, t))) Right (runP m t)
-- | Convert a hexadecimal digit character to its value.  No validation
-- is performed; characters outside @0-9a-fA-F@ yield garbage (the
-- uppercase branch is the catch-all).
hexDigitToInt :: Char -> Int
hexDigitToInt c
    | c >= '0' && c <= '9' = o - ord '0'
    | c >= 'a' && c <= 'f' = o - (ord 'a' - 10)
    | otherwise            = o - (ord 'A' - 10)
  where o = ord c
-- | Convert a decimal digit character to its value.  No validation is
-- performed; callers must pass a character in @'0'..'9'@.
digitToInt :: Char -> Int
digitToInt = subtract (ord '0') . ord

View File

@ -1,89 +0,0 @@
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
-- |
-- Module : Data.Text.Internal.Search
-- Copyright : (c) Bryan O'Sullivan 2009
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- Fast substring search for 'Text', based on work by Boyer, Moore,
-- Horspool, Sunday, and Lundh.
--
-- References:
--
-- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
-- Communications of the ACM, 20, 10, 762-772 (1977)
--
-- * R. N. Horspool: Practical Fast Searching in Strings. Software -
-- Practice and Experience 10, 501-506 (1980)
--
-- * D. M. Sunday: A Very Fast Substring Search Algorithm.
-- Communications of the ACM, 33, 8, 132-142 (1990)
--
-- * F. Lundh: The Fast Search Algorithm.
-- <http://effbot.org/zone/stringlib.htm> (2006)
module Data.Text.Internal.Search
(
indices
) where
import qualified Data.Text.Array as A
import Data.Word (Word64)
import Data.Text.Internal (Text(..))
import Data.Bits ((.|.), (.&.))
import Data.Text.Internal.Unsafe.Shift (shiftL)
-- | Strict, unpacked pair returned by the table builder in 'indices':
-- a 64-bit occurrence mask and a skip distance.
data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int
-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@. The offsets returned represent
-- uncorrected indices in the low-level \"needle\" array, to which its
-- offset must be added.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text                -- ^ Substring to search for (@needle@)
        -> Text                -- ^ Text to search in (@haystack@)
        -> [Int]
indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
    | nlen == 1              = scanOne (nindex 0)
    | nlen <= 0 || ldiff < 0 = []
    | otherwise              = scan 0
  where
    ldiff    = hlen - nlen
    nlast    = nlen - 1
    -- z: the needle's last code unit — the primary comparison anchor.
    z        = nindex nlast
    nindex k = A.unsafeIndex narr (noff+k)
    hindex k = A.unsafeIndex harr (hoff+k)
    -- hindex': like hindex, but reads one-past-the-end as 0 (used by
    -- the Sunday shift, which peeks past the current window).
    hindex' k | k == hlen = 0
              | otherwise = A.unsafeIndex harr (hoff+k)
    -- Build the 64-bit occurrence mask and the Horspool skip distance
    -- in one pass over the needle.
    buildTable !i !msk !skp
        | i >= nlast = (msk .|. swizzle z) :* skp
        | otherwise  = buildTable (i+1) (msk .|. swizzle c) skp'
        where c = nindex i
              skp' | c == z    = nlen - i - 2
                   | otherwise = skp
    swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f)
    scan !i
        | i > ldiff                  = []
        | c == z && candidateMatch 0 = i : scan (i + nlen)
        | otherwise                  = scan (i + delta)
        where c = hindex (i + nlast)
              candidateMatch !j
                  | j >= nlast               = True
                  | hindex (i+j) /= nindex j = False
                  | otherwise                = candidateMatch (j+1)
              -- delta: jump the whole window when the unit after it
              -- cannot occur in the needle at all.
              delta | nextInPattern = nlen + 1
                    | c == z        = skip + 1
                    | otherwise     = 1
              where nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0
    !(mask :* skip) = buildTable 0 0 (nlen-2)
    -- Single-unit needle: plain linear scan.
    scanOne c = loop 0
        where loop !i | i >= hlen     = []
                      | hindex i == c = i : loop (i+1)
                      | otherwise     = loop (i+1)
{-# INLINE indices #-}

View File

@ -1,56 +0,0 @@
{-# LANGUAGE CPP, MagicHash, UnboxedTuples #-}
{-# OPTIONS_HADDOCK not-home #-}
-- |
-- Module : Data.Text.Internal.Unsafe
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- A module containing /unsafe/ operations, for /very very careful/ use
-- in /heavily tested/ code.
module Data.Text.Internal.Unsafe
(
inlineInterleaveST
, inlinePerformIO
) where
import GHC.ST (ST(..))
#if defined(__GLASGOW_HASKELL__)
import GHC.IO (IO(IO))
import GHC.Base (realWorld#)
#endif
-- | Just like unsafePerformIO, but we inline it. Big performance gains as
-- it exposes lots of things to further inlining. /Very unsafe/. In
-- particular, you should do no memory allocation inside an
-- 'inlinePerformIO' block. On Hugs this is just @unsafePerformIO@.
--
{-# INLINE inlinePerformIO #-}
inlinePerformIO :: IO a -> a
#if defined(__GLASGOW_HASKELL__)
-- Runs the IO action against the shared realWorld# token and discards
-- the resulting state — safe only under the restrictions documented
-- above (no observable allocation/effects).
inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
#else
inlinePerformIO = unsafePerformIO
#endif
-- | Allow an 'ST' computation to be deferred lazily. When passed an
-- action of type 'ST' @s@ @a@, the action will only be performed when
-- the value of @a@ is demanded.
--
-- This function is identical to the normal unsafeInterleaveST, but is
-- inlined and hence faster.
--
-- /Note/: This operation is highly unsafe, as it can introduce
-- externally visible non-determinism into an 'ST' action.
inlineInterleaveST :: ST s a -> ST s a
inlineInterleaveST (ST m) = ST $ \ s ->
    -- Return the incoming state token unchanged; the thunk r only runs
    -- m when its result is demanded — hence the non-determinism warning.
    let r = case m s of (# _, res #) -> res in (# s, r #)
{-# INLINE inlineInterleaveST #-}

View File

@ -1,95 +0,0 @@
{-# LANGUAGE CPP, MagicHash #-}
-- |
-- Module : Data.Text.Internal.Unsafe.Char
-- Copyright : (c) 2008, 2009 Tom Harper,
-- (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fast character manipulation functions.
module Data.Text.Internal.Unsafe.Char
(
ord
, unsafeChr
, unsafeChr8
, unsafeChr32
, unsafeWrite
-- , unsafeWriteRev
) where
#ifdef ASSERTS
import Control.Exception (assert)
#endif
import Control.Monad.ST (ST)
import Data.Bits ((.&.))
import Data.Text.Internal.Unsafe.Shift (shiftR)
import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
import GHC.Word (Word8(..), Word16(..), Word32(..))
import qualified Data.Text.Array as A
-- | Code point of a 'Char' (an inlinable version of 'Data.Char.ord'
-- built directly on the primop).
ord :: Char -> Int
ord (C# c#) = I# (ord# c#)
{-# INLINE ord #-}

-- | Convert a 'Word16' to a 'Char'.  No range or validity checking is
-- performed; the caller must supply a value denoting a real character.
unsafeChr :: Word16 -> Char
unsafeChr (W16# w#) = C# (chr# (word2Int# w#))
{-# INLINE unsafeChr #-}

-- | As 'unsafeChr', for a 'Word8'.
unsafeChr8 :: Word8 -> Char
unsafeChr8 (W8# w#) = C# (chr# (word2Int# w#))
{-# INLINE unsafeChr8 #-}

-- | As 'unsafeChr', for a 'Word32'; unchecked, so values above the
-- Unicode range are the caller's responsibility.
unsafeChr32 :: Word32 -> Char
unsafeChr32 (W32# w#) = C# (chr# (word2Int# w#))
{-# INLINE unsafeChr32 #-}
-- | Write a character into the array at the given offset.  Returns
-- the number of 'Word16's written.
--
-- BMP characters take one code unit; anything above U+FFFF is encoded
-- as a UTF-16 surrogate pair (two units).  Bounds checks only happen
-- under -DASSERTS.
unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
unsafeWrite marr i c
    | n < 0x10000 = do
#if defined(ASSERTS)
        assert (i >= 0) . assert (i < A.length marr) $ return ()
#endif
        A.unsafeWrite marr i (fromIntegral n)
        return 1
    | otherwise = do
#if defined(ASSERTS)
        assert (i >= 0) . assert (i < A.length marr - 1) $ return ()
#endif
        A.unsafeWrite marr i lo
        A.unsafeWrite marr (i+1) hi
        return 2
    where n = ord c
          m = n - 0x10000
          -- High and low surrogate halves per the UTF-16 encoding.
          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
{-# INLINE unsafeWrite #-}
{-
unsafeWriteRev :: A.MArray s Word16 -> Int -> Char -> ST s Int
unsafeWriteRev marr i c
| n < 0x10000 = do
assert (i >= 0) . assert (i < A.length marr) $
A.unsafeWrite marr i (fromIntegral n)
return (i-1)
| otherwise = do
assert (i >= 1) . assert (i < A.length marr) $
A.unsafeWrite marr (i-1) lo
A.unsafeWrite marr i hi
return (i-2)
where n = ord c
m = n - 0x10000
lo = fromIntegral $ (m `shiftR` 10) + 0xD800
hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
{-# INLINE unsafeWriteRev #-}
-}

View File

@ -1,72 +0,0 @@
{-# LANGUAGE MagicHash #-}
-- |
-- Module : Data.Text.Internal.Unsafe.Shift
-- Copyright : (c) Bryan O'Sullivan 2009
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fast, unchecked bit shifting functions.
module Data.Text.Internal.Unsafe.Shift
(
UnsafeShift(..)
) where
-- import qualified Data.Bits as Bits
import GHC.Base
import GHC.Word
-- | This is a workaround for poor optimisation in GHC 6.8.2. It
-- fails to notice constant-width shifts, and adds a test and branch
-- to every shift. This imposes about a 10% performance hit.
--
-- These functions are undefined when the amount being shifted by is
-- greater than the size in bits of a machine Int#.
class UnsafeShift a where
    shiftL :: a -> Int -> a
    shiftR :: a -> Int -> a

instance UnsafeShift Word16 where
    {-# INLINE shiftL #-}
    -- narrow16Word# keeps the result within 16 bits after the shift.
    shiftL (W16# x#) (I# i#) = W16# (narrow16Word# (x# `uncheckedShiftL#` i#))
    {-# INLINE shiftR #-}
    shiftR (W16# x#) (I# i#) = W16# (x# `uncheckedShiftRL#` i#)

instance UnsafeShift Word32 where
    {-# INLINE shiftL #-}
    shiftL (W32# x#) (I# i#) = W32# (narrow32Word# (x# `uncheckedShiftL#` i#))
    {-# INLINE shiftR #-}
    shiftR (W32# x#) (I# i#) = W32# (x# `uncheckedShiftRL#` i#)

instance UnsafeShift Word64 where
    {-# INLINE shiftL #-}
    shiftL (W64# x#) (I# i#) = W64# (x# `uncheckedShiftL64#` i#)
    {-# INLINE shiftR #-}
    shiftR (W64# x#) (I# i#) = W64# (x# `uncheckedShiftRL64#` i#)

instance UnsafeShift Int where
    {-# INLINE shiftL #-}
    shiftL (I# x#) (I# i#) = I# (x# `iShiftL#` i#)
    {-# INLINE shiftR #-}
    -- iShiftRA# is an arithmetic (sign-extending) right shift.
    shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
{-
instance UnsafeShift Integer where
{-# INLINE shiftL #-}
shiftL = Bits.shiftL
{-# INLINE shiftR #-}
shiftR = Bits.shiftR
-}

File diff suppressed because it is too large Load Diff

View File

@ -1,57 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-----------------------------------------------------------------------------
-- |
-- Module : Data.Text.Lazy.Builder
-- Copyright : (c) 2013 Bryan O'Sullivan
-- (c) 2010 Johan Tibell
-- License : BSD-style (see LICENSE)
--
-- Maintainer : Johan Tibell <johan.tibell@gmail.com>
-- Portability : portable to Hugs and GHC
--
-- Efficient construction of lazy @Text@ values. The principal
-- operations on a @Builder@ are @singleton@, @fromText@, and
-- @fromLazyText@, which construct new builders, and 'mappend', which
-- concatenates two builders.
--
-- To get maximum performance when building lazy @Text@ values using a
-- builder, associate @mappend@ calls to the right. For example,
-- prefer
--
-- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c')
--
-- to
--
-- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c'
--
-- as the latter associates @mappend@ to the left. Or, equivalently,
-- prefer
--
-- > singleton 'a' <> singleton 'b' <> singleton 'c'
--
-- since the '<>' from recent versions of 'Data.Monoid' associates
-- to the right.
-----------------------------------------------------------------------------
module Data.Text.Lazy.Builder
( -- * The Builder type
Builder
, toLazyText
, toLazyTextWith
-- * Constructing Builders
, singleton
, fromText
, fromLazyText
, fromString
-- * Flushing the buffer state
, flush
) where
import Data.Text.Internal.Builder

View File

@ -1,264 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, MagicHash, RankNTypes, ScopedTypeVariables,
UnboxedTuples #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- Module: Data.Text.Lazy.Builder.Int
-- Copyright: (c) 2013 Bryan O'Sullivan
-- (c) 2011 MailRank, Inc.
-- License: BSD-style
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Portability: portable
--
-- Efficiently write an integral value to a 'Builder'.
module Data.Text.Lazy.Builder.Int
(
decimal
, hexadecimal
) where
import Data.Int (Int8, Int16, Int32, Int64)
import Data.Monoid (mempty)
import qualified Data.ByteString.Unsafe as B
import Data.Text.Internal.Builder.Functions ((<>), i2d)
import Data.Text.Internal.Builder
import Data.Text.Internal.Builder.Int.Digits (digits)
import Data.Text.Array
import Data.Word (Word, Word8, Word16, Word32, Word64)
import GHC.Base (quotInt, remInt)
import GHC.Num (quotRemInteger)
import GHC.Types (Int(..))
import Control.Monad.ST
#if MIN_VERSION_base(4,11,0)
import Prelude hiding ((<>))
#endif
#ifdef __GLASGOW_HASKELL__
# if defined(INTEGER_GMP)
import GHC.Integer.GMP.Internals (Integer(S#))
# elif defined(INTEGER_SIMPLE)
import GHC.Integer
# else
# error "You need to use either GMP or integer-simple."
# endif
#endif
#if defined(INTEGER_GMP) || defined(INTEGER_SIMPLE)
# define PAIR(a,b) (# a,b #)
#else
# define PAIR(a,b) (a,b)
#endif
-- | Render an integral value in decimal.
--
-- The RULES rewrite known-type calls to the specialised
-- 'boundedDecimal' / 'positive' / 'integer' paths; this generic body is
-- only the fallback for other Integral types.
decimal :: Integral a => a -> Builder
{-# RULES "decimal/Int8" decimal = boundedDecimal :: Int8 -> Builder #-}
{-# RULES "decimal/Int" decimal = boundedDecimal :: Int -> Builder #-}
{-# RULES "decimal/Int16" decimal = boundedDecimal :: Int16 -> Builder #-}
{-# RULES "decimal/Int32" decimal = boundedDecimal :: Int32 -> Builder #-}
{-# RULES "decimal/Int64" decimal = boundedDecimal :: Int64 -> Builder #-}
{-# RULES "decimal/Word" decimal = positive :: Data.Word.Word -> Builder #-}
{-# RULES "decimal/Word8" decimal = positive :: Word8 -> Builder #-}
{-# RULES "decimal/Word16" decimal = positive :: Word16 -> Builder #-}
{-# RULES "decimal/Word32" decimal = positive :: Word32 -> Builder #-}
{-# RULES "decimal/Word64" decimal = positive :: Word64 -> Builder #-}
{-# RULES "decimal/Integer" decimal = integer 10 :: Integer -> Builder #-}
-- The (<= -128) predicate conservatively routes any value that could be
-- a type's minBound through decimal''s negate-safe path (-128 is the
-- smallest minBound among the specialised Int widths).
decimal i = decimal' (<= -128) i
{-# NOINLINE decimal #-}
-- | Decimal rendering for bounded types: the (== minBound) predicate
-- lets 'decimal'' avoid overflow when negating @minBound@.
boundedDecimal :: (Integral a, Bounded a) => a -> Builder
{-# SPECIALIZE boundedDecimal :: Int -> Builder #-}
{-# SPECIALIZE boundedDecimal :: Int8 -> Builder #-}
{-# SPECIALIZE boundedDecimal :: Int16 -> Builder #-}
{-# SPECIALIZE boundedDecimal :: Int32 -> Builder #-}
{-# SPECIALIZE boundedDecimal :: Int64 -> Builder #-}
boundedDecimal i = decimal' (== minBound) i
-- | Decimal rendering with an overflow predicate: @p i@ should be True
-- when @negate i@ would overflow (i.e. @i@ may be @minBound@).  In that
-- case the last digit is peeled off with quotRem first, so only the
-- (smaller) quotient is negated.
decimal' :: (Integral a) => (a -> Bool) -> a -> Builder
{-# INLINE decimal' #-}
decimal' p i
    | i < 0 = if p i
              then let (q, r) = i `quotRem` 10
                       qq = -q
                       !n = countDigits qq
                   in writeN (n + 2) $ \marr off -> do
                        unsafeWrite marr off minus
                        posDecimal marr (off+1) n qq
                        unsafeWrite marr (off+n+1) (i2w (-r))
              else let j = -i
                       !n = countDigits j
                   in writeN (n + 1) $ \marr off ->
                        unsafeWrite marr off minus >> posDecimal marr (off+1) n j
    | otherwise = positive i
-- | Render a non-negative value in decimal, with a fast path for a
-- single digit.
positive :: (Integral a) => a -> Builder
{-# SPECIALIZE positive :: Int -> Builder #-}
{-# SPECIALIZE positive :: Int8 -> Builder #-}
{-# SPECIALIZE positive :: Int16 -> Builder #-}
{-# SPECIALIZE positive :: Int32 -> Builder #-}
{-# SPECIALIZE positive :: Int64 -> Builder #-}
{-# SPECIALIZE positive :: Word -> Builder #-}
{-# SPECIALIZE positive :: Word8 -> Builder #-}
{-# SPECIALIZE positive :: Word16 -> Builder #-}
{-# SPECIALIZE positive :: Word32 -> Builder #-}
{-# SPECIALIZE positive :: Word64 -> Builder #-}
positive i
    | i < 10    = writeN 1 $ \marr off -> unsafeWrite marr off (i2w i)
    | otherwise = let !n = countDigits i
                  in writeN n $ \marr off -> posDecimal marr off n i
-- | Write the decimal digits of a non-negative value into the array,
-- right-to-left, two digits at a time via the 'digits' lookup table.
-- @ds@ is the digit count (from 'countDigits'); @off0@ the start offset.
posDecimal :: (Integral a) =>
              forall s. MArray s -> Int -> Int -> a -> ST s ()
{-# INLINE posDecimal #-}
posDecimal marr off0 ds v0 = go (off0 + ds - 1) v0
  where go off v
           | v >= 100 = do
               let (q, r) = v `quotRem` 100
               write2 off r
               go (off - 2) q
           | v < 10    = unsafeWrite marr off (i2w v)
           | otherwise = write2 off v
        -- write2: emit a two-digit block from the packed digit table.
        write2 off i0 = do
          let i = fromIntegral i0; j = i + i
          unsafeWrite marr off $ get (j + 1)
          unsafeWrite marr (off - 1) $ get j
        get = fromIntegral . B.unsafeIndex digits
minus, zero :: Word16
{-# INLINE minus #-}
{-# INLINE zero #-}
minus = 45  -- code unit for '-'
zero = 48   -- code unit for '0'

-- | Convert a digit value (0-9) to its code unit.
i2w :: (Integral a) => a -> Word16
{-# INLINE i2w #-}
i2w v = zero + fromIntegral v
-- | The number of decimal digits in a non-negative value.
countDigits :: (Integral a) => a -> Int
{-# INLINE countDigits #-}
countDigits v0
  | fromIntegral v64 == v0 = go 1 v64
  | otherwise              = goBig 1 (fromIntegral v0)
  where v64 = fromIntegral v0
        -- For values that do not fit in Word64: peel 19 digits at a
        -- time (big = 10^19, the largest power of 10 in Word64 range).
        goBig !k (v :: Integer)
           | v > big   = goBig (k + 19) (v `quot` big)
           | otherwise = go k (fromIntegral v)
        big = 10000000000000000000
        -- Branchy comparison tree: counts up to 12 digits without any
        -- division, then strips 12 digits and repeats.
        go !k (v :: Word64)
           | v < 10    = k
           | v < 100   = k + 1
           | v < 1000  = k + 2
           | v < 1000000000000 =
               k + if v < 100000000
                   then if v < 1000000
                        then if v < 10000
                             then 3
                             else 4 + fin v 100000
                        else 6 + fin v 10000000
                   else if v < 10000000000
                        then 8 + fin v 1000000000
                        else 10 + fin v 100000000000
           | otherwise = go (k + 12) (v `quot` 1000000000000)
        fin v n = if v >= n then 1 else 0
-- | Render a non-negative integral value in hexadecimal.  Negative
-- inputs raise an error (see 'hexErrMsg'); the Integer case is rewritten
-- to 'hexInteger' by the RULE below.
hexadecimal :: Integral a => a -> Builder
{-# SPECIALIZE hexadecimal :: Int -> Builder #-}
{-# SPECIALIZE hexadecimal :: Int8 -> Builder #-}
{-# SPECIALIZE hexadecimal :: Int16 -> Builder #-}
{-# SPECIALIZE hexadecimal :: Int32 -> Builder #-}
{-# SPECIALIZE hexadecimal :: Int64 -> Builder #-}
{-# SPECIALIZE hexadecimal :: Word -> Builder #-}
{-# SPECIALIZE hexadecimal :: Word8 -> Builder #-}
{-# SPECIALIZE hexadecimal :: Word16 -> Builder #-}
{-# SPECIALIZE hexadecimal :: Word32 -> Builder #-}
{-# SPECIALIZE hexadecimal :: Word64 -> Builder #-}
{-# RULES "hexadecimal/Integer"
    hexadecimal = hexInteger :: Integer -> Builder #-}
hexadecimal i
    | i < 0     = error hexErrMsg
    | otherwise = go i
  where
    -- Emit most-significant digits first by recursing on the quotient.
    go n | n < 16    = hexDigit n
         | otherwise = go (n `quot` 16) <> hexDigit (n `rem` 16)
{-# NOINLINE[0] hexadecimal #-}
-- | Hexadecimal rendering for 'Integer', via the block-splitting
-- 'integer' driver.
hexInteger :: Integer -> Builder
hexInteger i
    | i < 0     = error hexErrMsg
    | otherwise = integer 16 i

hexErrMsg :: String
hexErrMsg = "Data.Text.Lazy.Builder.Int.hexadecimal: applied to negative number"
-- | Render a single digit value (expected 0-15) as a hex character.
hexDigit :: Integral a => a -> Builder
hexDigit n
    | n <= 9    = singleton $! i2d (fromIntegral n)
    | otherwise = singleton $! toEnum (fromIntegral n + 87)  -- 87 == ord 'a' - 10
{-# INLINE hexDigit #-}
-- | A strict pair of a block value and its digit count, used to compute
-- the largest Int-sized block for each base.
data T = T !Integer !Int

-- | Render an arbitrary-precision 'Integer' in the given base (10 or
-- 16).  The number is recursively split into blocks that fit in an
-- 'Int', which are then rendered by the fast fixed-width paths.
integer :: Int -> Integer -> Builder
#ifdef INTEGER_GMP
-- Small-integer fast path: an S# Integer fits in an Int.
integer 10 (S# i#) = decimal (I# i#)
integer 16 (S# i#) = hexadecimal (I# i#)
#endif
integer base i
    | i < 0     = singleton '-' <> go (-i)
    | otherwise = go i
  where
    go n | n < maxInt = int (fromInteger n)
         | otherwise  = putH (splitf (maxInt * maxInt) n)

    -- splitf/splith/splitb: divide-and-conquer split of n into a list
    -- of blocks, each < maxInt, most significant first.
    splitf p n
      | p > n     = [n]
      | otherwise = splith p (splitf (p*p) n)

    splith p (n:ns) = case n `quotRemInteger` p of
                        PAIR(q,r) | q > 0     -> q : r : splitb p ns
                                  | otherwise -> r : splitb p ns
    splith _ _      = error "splith: the impossible happened."

    splitb p (n:ns) = case n `quotRemInteger` p of
                        PAIR(q,r) -> q : r : splitb p ns
    splitb _ _      = []

    -- Largest power of the base that fits in Int, with its digit count.
    T maxInt10 maxDigits10 =
        until ((>mi) . (*10) . fstT) (\(T n d) -> T (n*10) (d+1)) (T 10 1)
      where mi = fromIntegral (maxBound :: Int)
    T maxInt16 maxDigits16 =
        until ((>mi) . (*16) . fstT) (\(T n d) -> T (n*16) (d+1)) (T 16 1)
      where mi = fromIntegral (maxBound :: Int)

    fstT (T a _) = a

    maxInt | base == 10 = maxInt10
           | otherwise  = maxInt16
    maxDigits | base == 10 = maxDigits10
              | otherwise  = maxDigits16

    -- putH: head block is rendered without zero padding; putB pads
    -- every subsequent block to maxDigits with pblock.
    putH (n:ns) = case n `quotRemInteger` maxInt of
                    PAIR(x,y)
                        | q > 0     -> int q <> pblock r <> putB ns
                        | otherwise -> int r <> putB ns
                        where q = fromInteger x
                              r = fromInteger y
    putH _ = error "putH: the impossible happened"

    putB (n:ns) = case n `quotRemInteger` maxInt of
                    PAIR(x,y) -> pblock q <> pblock r <> putB ns
                        where q = fromInteger x
                              r = fromInteger y
    putB _ = Data.Monoid.mempty

    int :: Int -> Builder
    int x | base == 10 = decimal x
          | otherwise  = hexadecimal x

    -- pblock: render exactly maxDigits digits, including leading zeros.
    pblock = loop maxDigits
      where
        loop !d !n
            | d == 1    = hexDigit n
            | otherwise = loop (d-1) q <> hexDigit r
            where q = n `quotInt` base
                  r = n `remInt` base

View File

@ -1,253 +0,0 @@
{-# LANGUAGE CPP, OverloadedStrings #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module: Data.Text.Lazy.Builder.RealFloat
-- Copyright: (c) The University of Glasgow 1994-2002
-- License: see libraries/base/LICENSE
--
-- Write a floating point value to a 'Builder'.
module Data.Text.Lazy.Builder.RealFloat
(
FPFormat(..)
, realFloat
, formatRealFloat
) where
import Data.Array.Base (unsafeAt)
import Data.Array.IArray
import Data.Text.Internal.Builder.Functions ((<>), i2d)
import Data.Text.Lazy.Builder.Int (decimal)
import Data.Text.Internal.Builder.RealFloat.Functions (roundTo)
import Data.Text.Lazy.Builder
import qualified Data.Text as T
#if MIN_VERSION_base(4,11,0)
import Prelude hiding ((<>))
#endif
-- | Control the rendering of floating point numbers.  The
-- 'Generic'/'Exponent'/'Fixed' selection is resolved in
-- 'formatRealFloat'.
data FPFormat = Exponent
              -- ^ Scientific notation (e.g. @2.3e123@).
              | Fixed
              -- ^ Standard decimal notation.
              | Generic
              -- ^ Use decimal notation for values between @0.1@ and
              -- @9,999,999@, and scientific notation otherwise.
                deriving (Enum, Read, Show)
-- | Show a signed 'RealFloat' value to full precision,
-- using standard decimal notation for arguments whose absolute value lies
-- between @0.1@ and @9,999,999@, and scientific notation otherwise.
realFloat :: (RealFloat a) => a -> Builder
{-# SPECIALIZE realFloat :: Float -> Builder #-}
{-# SPECIALIZE realFloat :: Double -> Builder #-}
-- Delegates to 'formatRealFloat' in 'Generic' mode with no fixed
-- precision.
realFloat x = formatRealFloat Generic Nothing x
-- | Encode a signed 'RealFloat' according to 'FPFormat' and optionally requested precision.
--
-- This corresponds to the @show{E,F,G}Float@ operations provided by @base@'s "Numeric" module.
--
-- __NOTE__: The functions in @base-4.12@ changed the serialisation in
-- case of a @Just 0@ precision; this version of @text@ still provides
-- the serialisation as implemented in @base-4.11@. The next major
-- version of @text@ will switch to the more correct @base-4.12@ serialisation.
formatRealFloat :: (RealFloat a) =>
                   FPFormat
                -> Maybe Int  -- ^ Number of decimal places to render.
                -> a
                -> Builder
{-# SPECIALIZE formatRealFloat :: FPFormat -> Maybe Int -> Float -> Builder #-}
{-# SPECIALIZE formatRealFloat :: FPFormat -> Maybe Int -> Double -> Builder #-}
formatRealFloat fmt decs x
   | isNaN x                   = "NaN"
   | isInfinite x              = if x < 0 then "-Infinity" else "Infinity"
   | x < 0 || isNegativeZero x = singleton '-' <> doFmt fmt (floatToDigits (-x))
   | otherwise                 = doFmt fmt (floatToDigits x)
 where
  -- doFmt works on (digits, exponent) pairs from 'floatToDigits':
  -- the value is 0.d1d2...dn * 10^e.
  doFmt format (is, e) =
    let ds = map i2d is in
    case format of
     -- Generic: fixed notation for exponents in [0,7], scientific
     -- otherwise.
     Generic ->
      doFmt (if e < 0 || e > 7 then Exponent else Fixed)
            (is,e)
     Exponent ->
      case decs of
       Nothing ->
        -- Full precision: d1.d2...dn e (e-1).
        let show_e' = decimal (e-1) in
        case ds of
          "0"     -> "0.0e0"
          [d]     -> singleton d <> ".0e" <> show_e'
          (d:ds') -> singleton d <> singleton '.' <> fromString ds' <> singleton 'e' <> show_e'
          []      -> error "formatRealFloat/doFmt/Exponent/Nothing: []"
       Just dec ->
        -- Fixed number of decimals (at least 1), rounding the digit
        -- list with 'roundTo'; ei accounts for a carry into a new digit.
        let dec' = max dec 1 in
        case is of
         [0] -> "0." <> fromText (T.replicate dec' "0") <> "e0"
         _ ->
          let
           (ei,is') = roundTo (dec'+1) is
           is'' = map i2d (if ei > 0 then init is' else is')
          in case is'' of
            []      -> error "formatRealFloat/doFmt/Exponent/Just: []"
            (d:ds') -> singleton d <> singleton '.' <> fromString ds' <> singleton 'e' <> decimal (e-1+ei)
     Fixed ->
      let
       mk0 ls = case ls of { "" -> "0" ; _ -> fromString ls}
      in
      case decs of
       Nothing
          -- e <= 0: the value is < 1; pad with zeros after "0.".
          | e <= 0    -> "0." <> fromText (T.replicate (-e) "0") <> fromString ds
          | otherwise ->
             -- Walk e digits into ds to place the decimal point,
             -- zero-filling if the digit list runs out.
             let
              f 0 s    rs  = mk0 (reverse s) <> singleton '.' <> mk0 rs
              f n s    ""  = f (n-1) ('0':s) ""
              f n s (r:rs) = f (n-1) (r:s) rs
             in
              f e "" ds
       Just dec ->
        let dec' = max dec 0 in
        if e >= 0 then
         let
          (ei,is') = roundTo (dec' + e) is
          (ls,rs)  = splitAt (e+ei) (map i2d is')
         in
         mk0 ls <> (if null rs then "" else singleton '.' <> fromString rs)
        else
         -- Negative exponent: prepend -e zeros before rounding.
         let
          (ei,is') = roundTo dec' (replicate (-e) 0 ++ is)
          is'' = map i2d (if ei > 0 then is' else 0:is')
         in case is'' of
              []      -> error "formatRealFloat/doFmt/Fixed: []"
              (d:ds') -> singleton d <> (if null ds' then "" else singleton '.' <> fromString ds')
-- Based on "Printing Floating-Point Numbers Quickly and Accurately"
-- by R.G. Burger and R.K. Dybvig in PLDI 96.
-- This version uses a much slower logarithm estimator. It should be improved.
-- | 'floatToDigits' takes a base and a non-negative 'RealFloat' number,
-- and returns a list of digits and an exponent.
-- In particular, if @x>=0@, and
--
-- > floatToDigits base x = ([d1,d2,...,dn], e)
--
-- then
--
-- (1) @n >= 1@
--
-- (2) @x = 0.d1d2...dn * (base**e)@
--
-- (3) @0 <= di <= base-1@
floatToDigits :: (RealFloat a) => a -> ([Int], Int)
{-# SPECIALIZE floatToDigits :: Float -> ([Int], Int) #-}
{-# SPECIALIZE floatToDigits :: Double -> ([Int], Int) #-}
floatToDigits 0 = ([0], 0)
floatToDigits x =
 let
  (f0, e0) = decodeFloat x
  (minExp0, _) = floatRange x
  p = floatDigits x
  b = floatRadix x
  minExp = minExp0 - p -- the real minimum exponent
  -- Haskell requires that f be adjusted so denormalized numbers
  -- will have an impossibly low exponent.  Adjust for this.
  (f, e) =
   let n = minExp - e0 in
   if n > 0 then (f0 `quot` (expt b n), e0+n) else (f0, e0)
  -- Scaled value r/s with half-ulp widths mUp/mDn bounding the interval
  -- of reals that round to x; the boundary cases come straight from the
  -- Burger-Dybvig paper referenced above.
  (r, s, mUp, mDn) =
   if e >= 0 then
    let be = expt b e in
    if f == expt b (p-1) then
      (f*be*b*2, 2*b, be*b, be)    -- according to Burger and Dybvig
    else
      (f*be*2, 2, be, be)
   else
    if e > minExp && f == expt b (p-1) then
      (f*b*2, expt b (-e+1)*2, b, 1)
    else
      (f*2, expt b (-e)*2, 1, 1)
  -- k is the decimal exponent estimate, corrected by 'fixup'.
  k :: Int
  k =
   let
    k0 :: Int
    k0 =
     if b == 2 then
        -- logBase 10 2 is very slightly larger than 8651/28738
        -- (about 5.3558e-10), so if log x >= 0, the approximation
        -- k1 is too small, hence we add one and need one fixup step less.
        -- If log x < 0, the approximation errs rather on the high side.
        -- That is usually more than compensated for by ignoring the
        -- fractional part of logBase 2 x, but when x is a power of 1/2
        -- or slightly larger and the exponent is a multiple of the
        -- denominator of the rational approximation to logBase 10 2,
        -- k1 is larger than logBase 10 x. If k1 > 1 + logBase 10 x,
        -- we get a leading zero-digit we don't want.
        -- With the approximation 3/10, this happened for
        -- 0.5^1030, 0.5^1040, ..., 0.5^1070 and values close above.
        -- The approximation 8651/28738 guarantees k1 < 1 + logBase 10 x
        -- for IEEE-ish floating point types with exponent fields
        -- <= 17 bits and mantissae of several thousand bits, earlier
        -- convergents to logBase 10 2 would fail for long double.
        -- Using quot instead of div is a little faster and requires
        -- fewer fixup steps for negative lx.
        let lx = p - 1 + e0
            k1 = (lx * 8651) `quot` 28738
        in if lx >= 0 then k1 + 1 else k1
     else
        -- f :: Integer, log :: Float -> Float,
        -- ceiling :: Float -> Int
        ceiling ((log (fromInteger (f+1) :: Float) +
                 fromIntegral e * log (fromInteger b)) /
                   log 10)
        --WAS: fromInt e * log (fromInteger b))
    -- Nudge the estimate until 10^n properly brackets r+mUp against s.
    fixup n =
      if n >= 0 then
        if r + mUp <= expt 10 n * s then n else fixup (n+1)
      else
        if expt 10 (-n) * (r + mUp) <= s then n else fixup (n+1)
   in
   fixup k0
  -- Digit generation: emit digits until the remaining interval
  -- (rn' +/- half-ulp) no longer straddles a representable value; the
  -- (True,True) case rounds to the nearer neighbour.
  gen ds rn sN mUpN mDnN =
   let
    (dn, rn') = (rn * 10) `quotRem` sN
    mUpN' = mUpN * 10
    mDnN' = mDnN * 10
   in
   case (rn' < mDnN', rn' + mUpN' > sN) of
    (True,  False) -> dn : ds
    (False, True)  -> dn+1 : ds
    (True,  True)  -> if rn' * 2 < sN then dn : ds else dn+1 : ds
    (False, False) -> gen (dn:ds) rn' sN mUpN' mDnN'
  rds =
   if k >= 0 then
      gen [] r (s * expt 10 k) mUp mDn
   else
     let bk = expt 10 (-k) in
     gen [] (r * bk) s (mUp * bk) (mDn * bk)
 in
 (map fromIntegral (reverse rds), k)
-- Exponentiation with a cache for the most common numbers.

-- | Bounds of the power-of-two cache.
minExpt, maxExpt :: Int
minExpt = 0
maxExpt = 1100

-- | @expt base n@ computes @base ^ n@, consulting a precomputed table
-- for the common cases of powers of 2 and 10.
--
-- The base-10 guard previously omitted the lower-bound check
-- (@n >= minExpt@), so a negative exponent would have indexed
-- 'expts10' out of bounds via 'unsafeAt' (undefined behaviour).  With
-- the bound in place such calls fall through to @base^n@, which raises
-- a well-defined \"Negative exponent\" error instead.
expt :: Integer -> Int -> Integer
expt base n
    | base == 2  && n >= minExpt && n <= maxExpt   = expts `unsafeAt` n
    | base == 10 && n >= minExpt && n <= maxExpt10 = expts10 `unsafeAt` n
    | otherwise                                    = base^n

-- | Cached powers of two: @expts ! n == 2^n@.
expts :: Array Int Integer
expts = array (minExpt,maxExpt) [(n,2^n) | n <- [minExpt .. maxExpt]]

-- | Upper bound of the power-of-ten cache (largest power of ten that
-- fits in a 'Double''s range).
maxExpt10 :: Int
maxExpt10 = 324

-- | Cached powers of ten: @expts10 ! n == 10^n@.
expts10 :: Array Int Integer
expts10 = array (minExpt,maxExpt10) [(n,10^n) | n <- [minExpt .. maxExpt10]]

View File

@ -1,254 +0,0 @@
{-# LANGUAGE BangPatterns,CPP #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Lazy.Encoding
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : portable
--
-- Functions for converting lazy 'Text' values to and from lazy
-- 'ByteString', using several standard encodings.
--
-- To gain access to a much larger family of encodings, use the
-- <http://hackage.haskell.org/package/text-icu text-icu package>.
module Data.Text.Lazy.Encoding
(
-- * Decoding ByteStrings to Text
-- $strict
decodeASCII
, decodeLatin1
, decodeUtf8
, decodeUtf16LE
, decodeUtf16BE
, decodeUtf32LE
, decodeUtf32BE
-- ** Catchable failure
, decodeUtf8'
-- ** Controllable error handling
, decodeUtf8With
, decodeUtf16LEWith
, decodeUtf16BEWith
, decodeUtf32LEWith
, decodeUtf32BEWith
-- * Encoding Text to ByteStrings
, encodeUtf8
, encodeUtf16LE
, encodeUtf16BE
, encodeUtf32LE
, encodeUtf32BE
-- * Encoding Text using ByteString Builders
, encodeUtf8Builder
, encodeUtf8BuilderEscaped
) where
import Control.Exception (evaluate, try)
import Data.Monoid (Monoid(..))
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
import Data.Word (Word8)
import qualified Data.ByteString as S
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Extra as B (safeStrategy, toLazyByteStringWith)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Internal as B
import qualified Data.ByteString.Unsafe as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E
import qualified Data.Text.Internal.Lazy.Fusion as F
import Data.Text.Unsafe (unsafeDupablePerformIO)
-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
-- encoded in one of the Unicode Transformation Formats (UTF) operate
-- in a /strict/ mode: each will throw an exception if given invalid
-- input.
--
-- Each function has a variant, whose name is suffixed with -'With',
-- that gives greater control over the handling of decoding errors.
-- For instance, 'decodeUtf8' will throw an exception, but
-- 'decodeUtf8With' allows the programmer to determine what to do on a
-- decoding error.
-- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
decodeASCII :: B.ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}

-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded
-- text, one lazy chunk per strict input chunk.
decodeLatin1 :: B.ByteString -> Text
decodeLatin1 bs = foldr step empty (B.toChunks bs)
  where
    step strict rest = chunk (TE.decodeLatin1 strict) rest
-- | Decode a 'ByteString' containing UTF-8 encoded text.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
#if defined(ASTERIUS)
-- Asterius build: no streaming decoder is available here, so the lazy
-- ByteString is forced strict and decoded in one go as a single chunk.
decodeUtf8With onErr lbs = Chunk (TE.decodeUtf8With onErr (B.toStrict lbs)) Empty
#else
decodeUtf8With onErr (B.Chunk b0 bs0) =
    -- Decode chunk by chunk; 'TE.streamDecodeUtf8With' hands back the
    -- decoded text, any undecoded trailing bytes @l@ (an incomplete
    -- multi-byte sequence), and a continuation @f@ that resumes with
    -- the next strict chunk.
    case TE.streamDecodeUtf8With onErr b0 of
      TE.Some t l f -> chunk t (go f l bs0)
  where
    go f0 _ (B.Chunk b bs) =
      case f0 b of
        TE.Some t l f -> chunk t (go f l bs)
    -- End of input: leftover bytes form a truncated sequence and are
    -- reported to the error handler, which may substitute a character.
    go _ l _
      | S.null l  = empty
      | otherwise = case onErr desc (Just (B.unsafeHead l)) of
                      Nothing -> empty
                      Just c  -> Chunk (T.singleton c) Empty
    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
-- Empty input decodes to empty output.
decodeUtf8With _ _ = empty
#endif
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code.  For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
-- NOTE(review): inlining is delayed to phase 0, presumably so the name
-- stays visible to rewrite RULES (the fusion rule below is disabled).
{-# INLINE[0] decodeUtf8 #-}
-- This rule seems to cause performance loss.
{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
-- | Decode a 'ByteString' containing UTF-8 encoded text..
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result.  If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
decodeUtf8' bs = unsafeDupablePerformIO $ do
    let t = decodeUtf8 bs
    -- Force the chunk spine so any decoding exception hidden inside the
    -- lazy structure is raised here, where 'try' can catch it.
    try (evaluate (rnf t `seq` t))
  where
    rnf Empty = ()
    rnf (Chunk _ ts) = rnf ts
{-# INLINE decodeUtf8' #-}
-- | Encode text using UTF-8 encoding.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8    Empty       = B.empty
encodeUtf8 lt@(Chunk t _) =
    B.toLazyByteStringWith strategy B.empty $ encodeUtf8Builder lt
  where
    -- To improve our small string performance, we use a strategy that
    -- allocates a buffer that is guaranteed to be large enough for the
    -- encoding of the first chunk, but not larger than the default
    -- B.smallChunkSize. We clamp the firstChunkSize to ensure that we don't
    -- generate too large buffers which hamper streaming.
    -- (4 bytes per code point is the UTF-8 worst case, plus one spare.)
    firstChunkSize = min B.smallChunkSize (4 * (T.length t + 1))
    strategy       = B.safeStrategy firstChunkSize B.defaultChunkSize
-- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding,
-- chunk by chunk.
--
-- @since 1.1.0.0
encodeUtf8Builder :: Text -> B.Builder
encodeUtf8Builder txt = foldrChunks step mempty txt
  where
    step c rest = TE.encodeUtf8Builder c `mappend` rest

-- | Encode text using UTF-8 encoding and escape the ASCII characters
-- using a 'BP.BoundedPrim'.
--
-- Use this function is to implement efficient encoders for text-based
-- formats like JSON or HTML.
--
-- @since 1.1.0.0
{-# INLINE encodeUtf8BuilderEscaped #-}
encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
encodeUtf8BuilderEscaped prim txt = foldrChunks step mempty txt
  where
    step c rest = TE.encodeUtf8BuilderEscaped prim c `mappend` rest
-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr = F.unstream . E.streamUtf16LE onErr
{-# INLINE decodeUtf16LEWith #-}

-- | Decode text from little endian UTF-16 encoding.
--
-- Throws an exception on invalid input; use 'decodeUtf16LEWith' for
-- more control over error handling.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE bs = decodeUtf16LEWith strictDecode bs
{-# INLINE decodeUtf16LE #-}

-- | Decode text from big endian UTF-16 encoding.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr = F.unstream . E.streamUtf16BE onErr
{-# INLINE decodeUtf16BEWith #-}

-- | Decode text from big endian UTF-16 encoding.
--
-- Throws an exception on invalid input; use 'decodeUtf16BEWith' for
-- more control over error handling.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE bs = decodeUtf16BEWith strictDecode bs
{-# INLINE decodeUtf16BE #-}

-- | Encode text using little endian UTF-16 encoding.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE txt = B.fromChunks (foldrChunks (\t ts -> TE.encodeUtf16LE t : ts) [] txt)
{-# INLINE encodeUtf16LE #-}

-- | Encode text using big endian UTF-16 encoding.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE txt = B.fromChunks (foldrChunks (\t ts -> TE.encodeUtf16BE t : ts) [] txt)
{-# INLINE encodeUtf16BE #-}

-- | Decode text from little endian UTF-32 encoding.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr = F.unstream . E.streamUtf32LE onErr
{-# INLINE decodeUtf32LEWith #-}

-- | Decode text from little endian UTF-32 encoding.
--
-- Throws an exception on invalid input; use 'decodeUtf32LEWith' for
-- more control over error handling.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE bs = decodeUtf32LEWith strictDecode bs
{-# INLINE decodeUtf32LE #-}

-- | Decode text from big endian UTF-32 encoding.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr = F.unstream . E.streamUtf32BE onErr
{-# INLINE decodeUtf32BEWith #-}

-- | Decode text from big endian UTF-32 encoding.
--
-- Throws an exception on invalid input; use 'decodeUtf32BEWith' for
-- more control over error handling.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE bs = decodeUtf32BEWith strictDecode bs
{-# INLINE decodeUtf32BE #-}

-- | Encode text using little endian UTF-32 encoding.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE txt = B.fromChunks (foldrChunks (\t ts -> TE.encodeUtf32LE t : ts) [] txt)
{-# INLINE encodeUtf32LE #-}

-- | Encode text using big endian UTF-32 encoding.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE txt = B.fromChunks (foldrChunks (\t ts -> TE.encodeUtf32BE t : ts) [] txt)
{-# INLINE encodeUtf32BE #-}

View File

@ -1,195 +0,0 @@
{-# LANGUAGE BangPatterns, CPP, RecordWildCards #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Lazy.IO
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Simon Marlow
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : GHC
--
-- Efficient locale-sensitive support for lazy text I\/O.
--
-- Skip past the synopsis for some important notes on performance and
-- portability across different versions of GHC.
module Data.Text.Lazy.IO
(
-- * Performance
-- $performance
-- * Locale support
-- $locale
-- * File-at-a-time operations
readFile
, writeFile
, appendFile
-- * Operations on handles
, hGetContents
, hGetLine
, hPutStr
, hPutStrLn
-- * Special cases for standard input and output
, interact
, getContents
, getLine
, putStr
, putStrLn
) where
import Data.Text.Lazy (Text)
import Prelude hiding (appendFile, getContents, getLine, interact,
putStr, putStrLn, readFile, writeFile)
import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout,
withFile)
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as L
import qualified Control.Exception as E
import Control.Monad (when)
import Data.IORef (readIORef)
import Data.Text.Internal.IO (hGetLineWith, readChunk)
import Data.Text.Internal.Lazy (chunk, empty)
import GHC.IO.Buffer (isEmptyBuffer)
import GHC.IO.Exception (IOException(..), IOErrorType(..), ioException)
import GHC.IO.Handle.Internals (augmentIOError, hClose_help,
wantReadableHandle, withHandle)
import GHC.IO.Handle.Types (Handle__(..), HandleType(..))
import System.IO (BufferMode(..), hGetBuffering, hSetBuffering)
import System.IO.Error (isEOFError)
import System.IO.Unsafe (unsafeInterleaveIO)
-- $performance
--
-- The functions in this module obey the runtime system's locale,
-- character set encoding, and line ending conversion settings.
--
-- If you know in advance that you will be working with data that has
-- a specific encoding (e.g. UTF-8), and your application is highly
-- performance sensitive, you may find that it is faster to perform
-- I\/O with bytestrings and to encode and decode yourself than to use
-- the functions in this module.
--
-- Whether this will hold depends on the version of GHC you are using,
-- the platform you are working on, the data you are working with, and
-- the encodings you are using, so be sure to test for yourself.
-- | Read a file and return its contents as a string.  The file is
-- read lazily, as with 'getContents'.
readFile :: FilePath -> IO Text
readFile name = hGetContents =<< openFile name ReadMode

-- | Write a string to a file.  The file is truncated to zero length
-- before writing begins.
writeFile :: FilePath -> Text -> IO ()
writeFile p t = withFile p WriteMode (\h -> hPutStr h t)

-- | Write a string the end of a file.
appendFile :: FilePath -> Text -> IO ()
appendFile p t = withFile p AppendMode (\h -> hPutStr h t)
-- | Lazily read the remaining contents of a 'Handle'.  The 'Handle'
-- will be closed after the read completes, or on error.
hGetContents :: Handle -> IO Text
hGetContents h = do
  chooseGoodBuffering h
  wantReadableHandle "hGetContents" h $ \hh -> do
    ts <- lazyRead h
    -- Mark the handle semi-closed, as 'System.IO.hGetContents' does:
    -- explicit reads are rejected while the lazy read is in progress.
    return (hh{haType=SemiClosedHandle}, ts)
-- | Use a more efficient buffer size if we're reading in
-- block-buffered mode with the default buffer size.
chooseGoodBuffering :: Handle -> IO ()
chooseGoodBuffering h = do
  mode <- hGetBuffering h
  case mode of
    BlockBuffering Nothing -> hSetBuffering h (BlockBuffering (Just 16384))
    _                      -> return ()
-- Produce the handle's remaining contents one chunk at a time; each
-- 'unsafeInterleaveIO' step re-inspects the handle state on demand.
lazyRead :: Handle -> IO Text
lazyRead h = unsafeInterleaveIO $
  withHandle "hGetContents" h $ \hh -> do
    case haType hh of
      -- Already closed: no more data, end the lazy stream.
      ClosedHandle     -> return (hh, L.empty)
      SemiClosedHandle -> lazyReadBuffered h hh
      _ -> ioException
             (IOError (Just h) IllegalOperation "hGetContents"
              "illegal handle type" Nothing Nothing)
-- Read one chunk from the handle's character buffer and chain the rest
-- lazily; on any IOException the handle is closed first.
lazyReadBuffered :: Handle -> Handle__ -> IO (Handle__, Text)
lazyReadBuffered h hh@Handle__{..} = do
   buf <- readIORef haCharBuffer
   (do t <- readChunk hh buf
       ts <- lazyRead h
       return (hh, chunk t ts)) `E.catch` \e -> do
     (hh', _) <- hClose_help hh
     if isEOFError e
       then return $ if isEmptyBuffer buf
                     then (hh', empty)
                     -- NOTE(review): a non-empty buffer at EOF yields a
                     -- lone '\r'; presumably this flushes a pending CR
                     -- from newline conversion -- confirm against
                     -- GHC.IO.Handle internals.
                     else (hh', L.singleton '\r')
       else E.throwIO (augmentIOError e "hGetContents" h)
-- | Read a single line from a handle.
hGetLine :: Handle -> IO Text
hGetLine h = hGetLineWith L.fromChunks h

-- | Write a string to a handle, one strict chunk at a time.
hPutStr :: Handle -> Text -> IO ()
hPutStr h t = mapM_ (T.hPutStr h) (L.toChunks t)

-- | Write a string to a handle, followed by a newline.
hPutStrLn :: Handle -> Text -> IO ()
hPutStrLn h t = do
  hPutStr h t
  hPutChar h '\n'

-- | The 'interact' function takes a function of type @Text -> Text@
-- as its argument.  The entire input from the standard input device is
-- passed (lazily) to this function as its argument, and the resulting
-- string is output on the standard output device.
interact :: (Text -> Text) -> IO ()
interact f = getContents >>= putStr . f

-- | Lazily read all user input on 'stdin' as a single string.
getContents :: IO Text
getContents = hGetContents stdin

-- | Read a single line of user input from 'stdin'.
getLine :: IO Text
getLine = hGetLine stdin

-- | Write a string to 'stdout'.
putStr :: Text -> IO ()
putStr t = hPutStr stdout t

-- | Write a string to 'stdout', followed by a newline.
putStrLn :: Text -> IO ()
putStrLn t = hPutStrLn stdout t
-- $locale
--
-- /Note/: The behaviour of functions in this module depends on the
-- version of GHC you are using.
--
-- Beginning with GHC 6.12, text I\/O is performed using the system or
-- handle's current locale and line ending conventions.
--
-- Under GHC 6.10 and earlier, the system I\/O libraries /do not
-- support/ locale-sensitive I\/O or line ending conversion. On these
-- versions of GHC, functions in this library all use UTF-8. What
-- does this mean in practice?
--
-- * All data that is read will be decoded as UTF-8.
--
-- * Before data is written, it is first encoded as UTF-8.
--
-- * On both reading and writing, the platform's native newline
-- conversion is performed.
--
-- If you must use a non-UTF-8 locale on an older version of GHC, you
-- will have to perform the transcoding yourself, e.g. as follows:
--
-- > import qualified Data.ByteString.Lazy as B
-- > import Data.Text.Lazy (Text)
-- > import Data.Text.Lazy.Encoding (encodeUtf16)
-- >
-- > putStr_Utf16LE :: Text -> IO ()
-- > putStr_Utf16LE t = B.putStr (encodeUtf16LE t)

View File

@ -1,20 +0,0 @@
{-# LANGUAGE BangPatterns, DeriveDataTypeable #-}
-- |
-- Module : Data.Text.Lazy.Internal
-- Copyright : (c) 2013 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- This module has been renamed to 'Data.Text.Internal.Lazy'. This
-- name for the module will be removed in the next major release.
module Data.Text.Lazy.Internal
{-# DEPRECATED "Use Data.Text.Internal.Lazy instead" #-}
(
module Data.Text.Internal.Lazy
) where
import Data.Text.Internal.Lazy

View File

@ -1,192 +0,0 @@
{-# LANGUAGE OverloadedStrings, CPP #-}
#if __GLASGOW_HASKELL__ >= 704
{-# LANGUAGE Safe #-}
#elif __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Lazy.Read
-- Copyright : (c) 2010, 2011 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : GHC
--
-- Functions used frequently when reading textual data.
module Data.Text.Lazy.Read
(
Reader
, decimal
, hexadecimal
, signed
, rational
, double
) where
import Control.Monad (liftM)
import Data.Char (isDigit, isHexDigit)
import Data.Int (Int8, Int16, Int32, Int64)
import Data.Ratio ((%))
import Data.Text.Internal.Read
import Data.Text.Lazy as T
import Data.Word (Word, Word8, Word16, Word32, Word64)
-- | Read some text. If the read succeeds, return its value and the
-- remaining text, otherwise an error message.
type Reader a = IReader Text a
type Parser = IParser Text
-- | Read a decimal integer. The input must begin with at least one
-- decimal digit, and is consumed until a non-digit or end of string
-- is reached.
--
-- This function does not handle leading sign characters. If you need
-- to handle signed input, use @'signed' 'decimal'@.
--
-- /Note/: For fixed-width integer types, this function does not
-- attempt to detect overflow, so a sufficiently long input may give
-- incorrect results. If you are worried about overflow, use
-- 'Integer' for your result type.
decimal :: Integral a => Reader a
{-# SPECIALIZE decimal :: Reader Int #-}
{-# SPECIALIZE decimal :: Reader Int8 #-}
{-# SPECIALIZE decimal :: Reader Int16 #-}
{-# SPECIALIZE decimal :: Reader Int32 #-}
{-# SPECIALIZE decimal :: Reader Int64 #-}
{-# SPECIALIZE decimal :: Reader Integer #-}
{-# SPECIALIZE decimal :: Reader Data.Word.Word #-}
{-# SPECIALIZE decimal :: Reader Word8 #-}
{-# SPECIALIZE decimal :: Reader Word16 #-}
{-# SPECIALIZE decimal :: Reader Word32 #-}
{-# SPECIALIZE decimal :: Reader Word64 #-}
-- Accumulate left-to-right: each digit shifts the total by one decimal
-- place.  No overflow detection for fixed-width types (see above).
decimal txt
    | T.null digits = Left "input does not start with a digit"
    | otherwise     = Right (T.foldl' step 0 digits, rest)
  where
    (digits, rest) = T.span isDigit txt
    step acc c     = acc * 10 + fromIntegral (digitToInt c)

-- | Read a hexadecimal integer, consisting of an optional leading
-- @\"0x\"@ followed by at least one hexadecimal digit.  Input is
-- consumed until a non-hex-digit or end of string is reached.
-- This function is case insensitive.
--
-- This function does not handle leading sign characters.  If you need
-- to handle signed input, use @'signed' 'hexadecimal'@.
--
-- /Note/: For fixed-width integer types, this function does not
-- attempt to detect overflow, so a sufficiently long input may give
-- incorrect results.  If you are worried about overflow, use
-- 'Integer' for your result type.
hexadecimal :: Integral a => Reader a
{-# SPECIALIZE hexadecimal :: Reader Int #-}
{-# SPECIALIZE hexadecimal :: Reader Integer #-}
hexadecimal txt
    | pre == "0x" || pre == "0X" = hex rest
    | otherwise                  = hex txt
  where
    (pre, rest) = T.splitAt 2 txt
-- | Read at least one hexadecimal digit (case insensitive) and return
-- the accumulated value plus the remaining input.
--
-- The SPECIALIZE pragmas below previously named @hexadecimal@ instead
-- of @hex@ (the strict "Data.Text.Read" twin names them correctly), so
-- @hex@ received no specialisations while @hexadecimal@ got redundant
-- duplicates; they now target this function.
hex :: Integral a => Reader a
{-# SPECIALIZE hex :: Reader Int #-}
{-# SPECIALIZE hex :: Reader Int8 #-}
{-# SPECIALIZE hex :: Reader Int16 #-}
{-# SPECIALIZE hex :: Reader Int32 #-}
{-# SPECIALIZE hex :: Reader Int64 #-}
{-# SPECIALIZE hex :: Reader Integer #-}
{-# SPECIALIZE hex :: Reader Word #-}
{-# SPECIALIZE hex :: Reader Word8 #-}
{-# SPECIALIZE hex :: Reader Word16 #-}
{-# SPECIALIZE hex :: Reader Word32 #-}
{-# SPECIALIZE hex :: Reader Word64 #-}
hex txt
    | T.null h  = Left "input does not start with a hexadecimal digit"
    | otherwise = Right (T.foldl' go 0 h, t)
  where (h,t)  = T.span isHexDigit txt
        go n d = (n * 16 + fromIntegral (hexDigitToInt d))
-- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and
-- apply it to the result of applying the given reader.
signed :: Num a => Reader a -> Reader a
{-# INLINE signed #-}
signed f = runP (signa (P f))

-- | Read a rational number.
--
-- This function accepts an optional leading sign character, followed
-- by at least one decimal digit.  The syntax similar to that accepted
-- by the 'read' function, with the exception that a trailing @\'.\'@
-- or @\'e\'@ /not/ followed by a number is not consumed.
--
-- Examples:
--
-- >rational "3"     == Right (3.0, "")
-- >rational "3.1"   == Right (3.1, "")
-- >rational "3e4"   == Right (30000.0, "")
-- >rational "3.1e4" == Right (31000.0, "")
-- >rational ".3"    == Left "input does not start with a digit"
-- >rational "e3"    == Left "input does not start with a digit"
--
-- Examples of differences from 'read':
--
-- >rational "3.foo" == Right (3.0, ".foo")
-- >rational "3e"    == Right (3.0, "e")
rational :: Fractional a => Reader a
{-# SPECIALIZE rational :: Reader Double #-}
rational = floaty combine
  where
    combine real frac fracDenom =
      fromRational (real % 1 + frac % fracDenom)

-- | Read a rational number.
--
-- The syntax accepted by this function is the same as for 'rational'.
--
-- /Note/: This function is almost ten times faster than 'rational',
-- but is slightly less accurate.
--
-- The 'Double' type supports about 16 decimal places of accuracy.
-- For 94.2% of numbers, this function and 'rational' give identical
-- results, but for the remaining 5.8%, this function loses precision
-- around the 15th decimal place.  For 0.001% of numbers, this
-- function will lose precision at the 13th or 14th decimal place.
double :: Reader Double
double = floaty combine
  where
    combine real frac fracDenom =
      fromIntegral real + fromIntegral frac / fromIntegral fracDenom
-- Parse an optional sign and apply it to the given parser's result.
signa :: Num a => Parser a -> Parser a
{-# SPECIALIZE signa :: Parser Int -> Parser Int #-}
{-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-}
{-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-}
{-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-}
{-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-}
{-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-}
signa p = do
  s <- perhaps '+' (char (\c -> c == '-' || c == '+'))
  case s of
    '+' -> p
    _   -> fmap negate p

-- Consume one character satisfying the predicate.
char :: (Char -> Bool) -> Parser Char
char p = P step
  where
    step t = case T.uncons t of
               Just (c,t') | p c -> Right (c,t')
               _                 -> Left "character does not match"
-- | Parse sign, integral part, optional fraction, and optional
-- exponent, then combine them with the supplied function.  The 'T'
-- record (from "Data.Text.Internal.Read") carries the fraction's value
-- and its digit count.
floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a
{-# INLINE floaty #-}
floaty f = runP $ do
  sign <- perhaps '+' $ char (\c -> c == '-' || c == '+')
  real <- P decimal
  T fraction fracDigits <- perhaps (T 0 0) $ do
    _ <- char (=='.')
    -- Count the fraction's digits before consuming them, so leading
    -- zeros in the fraction still scale the denominator correctly.
    digits <- P $ \t -> Right (fromIntegral . T.length $ T.takeWhile isDigit t, t)
    n <- P decimal
    return $ T n digits
  let e c = c == 'e' || c == 'E'
  power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int)
  -- Combine the parts; 10 ^^ power handles negative exponents.
  let n = if fracDigits == 0
          then if power == 0
               then fromIntegral real
               else fromIntegral real * (10 ^^ power)
          else if power == 0
               then f real fraction (10 ^ fracDigits)
               else f real fraction (10 ^ fracDigits) * (10 ^^ power)
  return $! if sign == '+'
            then n
            else -n

View File

@ -1,200 +0,0 @@
{-# LANGUAGE OverloadedStrings, UnboxedTuples, CPP #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Read
-- Copyright : (c) 2010, 2011 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : GHC
--
-- Functions used frequently when reading textual data.
module Data.Text.Read
(
Reader
, decimal
, hexadecimal
, signed
, rational
, double
) where
import Control.Monad (liftM)
import Data.Char (isDigit, isHexDigit)
import Data.Int (Int8, Int16, Int32, Int64)
import Data.Ratio ((%))
import Data.Text as T
import Data.Text.Internal.Private (span_)
import Data.Text.Internal.Read
import Data.Word (Word, Word8, Word16, Word32, Word64)
-- | Read some text. If the read succeeds, return its value and the
-- remaining text, otherwise an error message.
type Reader a = IReader Text a
type Parser a = IParser Text a
-- | Read a decimal integer. The input must begin with at least one
-- decimal digit, and is consumed until a non-digit or end of string
-- is reached.
--
-- This function does not handle leading sign characters. If you need
-- to handle signed input, use @'signed' 'decimal'@.
--
-- /Note/: For fixed-width integer types, this function does not
-- attempt to detect overflow, so a sufficiently long input may give
-- incorrect results. If you are worried about overflow, use
-- 'Integer' for your result type.
decimal :: Integral a => Reader a
{-# SPECIALIZE decimal :: Reader Int #-}
{-# SPECIALIZE decimal :: Reader Int8 #-}
{-# SPECIALIZE decimal :: Reader Int16 #-}
{-# SPECIALIZE decimal :: Reader Int32 #-}
{-# SPECIALIZE decimal :: Reader Int64 #-}
{-# SPECIALIZE decimal :: Reader Integer #-}
{-# SPECIALIZE decimal :: Reader Data.Word.Word #-}
{-# SPECIALIZE decimal :: Reader Word8 #-}
{-# SPECIALIZE decimal :: Reader Word16 #-}
{-# SPECIALIZE decimal :: Reader Word32 #-}
{-# SPECIALIZE decimal :: Reader Word64 #-}
-- Accumulate left-to-right; 'span_' returns an unboxed pair to avoid
-- allocation on this hot path.
decimal txt
    | T.null digits = Left "input does not start with a digit"
    | otherwise     = Right (T.foldl' step 0 digits, rest)
  where
    (# digits, rest #) = span_ isDigit txt
    step acc c         = acc * 10 + fromIntegral (digitToInt c)

-- | Read a hexadecimal integer, consisting of an optional leading
-- @\"0x\"@ followed by at least one hexadecimal digit.  Input is
-- consumed until a non-hex-digit or end of string is reached.
-- This function is case insensitive.
--
-- This function does not handle leading sign characters.  If you need
-- to handle signed input, use @'signed' 'hexadecimal'@.
--
-- /Note/: For fixed-width integer types, this function does not
-- attempt to detect overflow, so a sufficiently long input may give
-- incorrect results.  If you are worried about overflow, use
-- 'Integer' for your result type.
hexadecimal :: Integral a => Reader a
{-# SPECIALIZE hexadecimal :: Reader Int #-}
{-# SPECIALIZE hexadecimal :: Reader Int8 #-}
{-# SPECIALIZE hexadecimal :: Reader Int16 #-}
{-# SPECIALIZE hexadecimal :: Reader Int32 #-}
{-# SPECIALIZE hexadecimal :: Reader Int64 #-}
{-# SPECIALIZE hexadecimal :: Reader Integer #-}
{-# SPECIALIZE hexadecimal :: Reader Word #-}
{-# SPECIALIZE hexadecimal :: Reader Word8 #-}
{-# SPECIALIZE hexadecimal :: Reader Word16 #-}
{-# SPECIALIZE hexadecimal :: Reader Word32 #-}
{-# SPECIALIZE hexadecimal :: Reader Word64 #-}
hexadecimal txt
    | pre == "0x" || pre == "0X" = hex rest
    | otherwise                  = hex txt
  where
    (pre, rest) = T.splitAt 2 txt

-- Read at least one hexadecimal digit (case insensitive).
hex :: Integral a => Reader a
{-# SPECIALIZE hex :: Reader Int #-}
{-# SPECIALIZE hex :: Reader Int8 #-}
{-# SPECIALIZE hex :: Reader Int16 #-}
{-# SPECIALIZE hex :: Reader Int32 #-}
{-# SPECIALIZE hex :: Reader Int64 #-}
{-# SPECIALIZE hex :: Reader Integer #-}
{-# SPECIALIZE hex :: Reader Word #-}
{-# SPECIALIZE hex :: Reader Word8 #-}
{-# SPECIALIZE hex :: Reader Word16 #-}
{-# SPECIALIZE hex :: Reader Word32 #-}
{-# SPECIALIZE hex :: Reader Word64 #-}
hex txt
    | T.null digits = Left "input does not start with a hexadecimal digit"
    | otherwise     = Right (T.foldl' step 0 digits, rest)
  where
    (# digits, rest #) = span_ isHexDigit txt
    step acc c         = acc * 16 + fromIntegral (hexDigitToInt c)
-- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and
-- apply it to the result of applying the given reader.
signed :: Num a => Reader a -> Reader a
{-# INLINE signed #-}
signed f = runP (signa (P f))

-- | Read a rational number.
--
-- This function accepts an optional leading sign character, followed
-- by at least one decimal digit.  The syntax similar to that accepted
-- by the 'read' function, with the exception that a trailing @\'.\'@
-- or @\'e\'@ /not/ followed by a number is not consumed.
--
-- Examples (with behaviour identical to 'read'):
--
-- >rational "3"     == Right (3.0, "")
-- >rational "3.1"   == Right (3.1, "")
-- >rational "3e4"   == Right (30000.0, "")
-- >rational "3.1e4" == Right (31000.0, "")
-- >rational ".3"    == Left "input does not start with a digit"
-- >rational "e3"    == Left "input does not start with a digit"
--
-- Examples of differences from 'read':
--
-- >rational "3.foo" == Right (3.0, ".foo")
-- >rational "3e"    == Right (3.0, "e")
rational :: Fractional a => Reader a
{-# SPECIALIZE rational :: Reader Double #-}
rational = floaty combine
  where
    combine real frac fracDenom =
      fromRational (real % 1 + frac % fracDenom)

-- | Read a rational number.
--
-- The syntax accepted by this function is the same as for 'rational'.
--
-- /Note/: This function is almost ten times faster than 'rational',
-- but is slightly less accurate.
--
-- The 'Double' type supports about 16 decimal places of accuracy.
-- For 94.2% of numbers, this function and 'rational' give identical
-- results, but for the remaining 5.8%, this function loses precision
-- around the 15th decimal place.  For 0.001% of numbers, this
-- function will lose precision at the 13th or 14th decimal place.
double :: Reader Double
double = floaty combine
  where
    combine real frac fracDenom =
      fromIntegral real + fromIntegral frac / fromIntegral fracDenom
-- | Consume an optional sign character and apply it to the result of
-- the given parser: a leading @\'-\'@ negates, anything else (including
-- no sign at all, which defaults to @\'+\'@) leaves it unchanged.
signa :: Num a => Parser a -> Parser a
{-# SPECIALIZE signa :: Parser Int -> Parser Int #-}
{-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-}
{-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-}
{-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-}
{-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-}
{-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-}
signa p = do
  sign <- perhaps '+' (char (\c -> c == '-' || c == '+'))
  case sign of
    '-' -> negate `liftM` p
    _   -> p
-- | Consume exactly one character satisfying the predicate, failing
-- (with a fixed message) on empty input or a non-matching character.
char :: (Char -> Bool) -> Parser Char
char p = P matchOne
  where
    matchOne t =
      case T.uncons t of
        Just (c, rest)
          | p c -> Right (c, rest)
        _ -> Left "character does not match"
-- Shared worker for 'rational' and 'double': parses
-- @[sign] digits [\'.\' digits] [(\'e\'|\'E\') [sign] digits]@ and lets the
-- supplied function @f@ combine integral part, fraction digits and
-- fraction denominator into the result type.
floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a
{-# INLINE floaty #-}
floaty f = runP $ do
sign <- perhaps '+' $ char (\c -> c == '-' || c == '+')
real <- P decimal
-- T fraction fracDigits: the digit count is recorded *before* parsing
-- the fraction's value, so leading zeros (e.g. "1.007") still widen the
-- denominator below even though they vanish from the parsed Integer.
T fraction fracDigits <- perhaps (T 0 0) $ do
_ <- char (=='.')
digits <- P $ \t -> Right (T.length $ T.takeWhile isDigit t, t)
n <- P decimal
return $ T n digits
let e c = c == 'e' || c == 'E'
-- An 'e'/'E' not followed by a number fails inside 'perhaps' and is
-- left unconsumed (hence rational "3e" == Right (3.0, "e")).
power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int)
-- With no fractional digits, bypass 'f' entirely and stay in the
-- cheaper fromIntegral path; 10 ^^ power handles negative exponents.
let n = if fracDigits == 0
then if power == 0
then fromIntegral real
else fromIntegral real * (10 ^^ power)
else if power == 0
then f real fraction (10 ^ fracDigits)
else f real fraction (10 ^ fracDigits) * (10 ^^ power)
-- $! forces the numeric result before returning it from the parser.
return $! if sign == '+'
then n
else -n

View File

@ -1,90 +0,0 @@
{-# LANGUAGE CPP, MagicHash #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Text.Show
-- Copyright : (c) 2009-2015 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
module Data.Text.Show
(
singleton
, unpack
, unpackCString#
) where
import Control.Monad.ST (ST)
import Data.Text.Internal (Text(..), empty_, safe)
import Data.Text.Internal.Fusion (stream, unstream)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import GHC.Prim (Addr#)
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Fusion.Common as S
#if __GLASGOW_HASKELL__ >= 702
import qualified GHC.CString as GHC
#else
import qualified GHC.Base as GHC
#endif
instance Show Text where
showsPrec p ps r = showsPrec p (unpack ps) r
-- | /O(n)/ Convert a 'Text' into a 'String'.  Subject to fusion.
unpack :: Text -> String
unpack = S.unstreamList . stream
-- Phase-controlled INLINE: kept un-inlined during the early simplifier
-- phases so rewrite rules mentioning 'unpack' can still match on it.
{-# INLINE [1] unpack #-}
-- | /O(n)/ Convert a literal string into a 'Text'.
--
-- This is exposed solely for people writing GHC rewrite rules.
--
-- @since 1.2.1.1
unpackCString# :: Addr# -> Text
unpackCString# addr# = unstream (S.streamCString# addr#)
-- NOINLINE keeps this as one shared definition: the "TEXT literal"
-- RULES below rewrite unpacked string literals *to* this name, which
-- only works if it is not inlined away first.
{-# NOINLINE unpackCString# #-}
{-# RULES "TEXT literal" [1] forall a.
unstream (S.map safe (S.streamList (GHC.unpackCString# a)))
= unpackCString# a #-}
{-# RULES "TEXT literal UTF8" [1] forall a.
unstream (S.map safe (S.streamList (GHC.unpackCStringUtf8# a)))
= unpackCString# a #-}
{-# RULES "TEXT empty literal" [1]
unstream (S.map safe (S.streamList []))
= empty_ #-}
{-# RULES "TEXT singleton literal" [1] forall a.
unstream (S.map safe (S.streamList [a]))
= singleton_ a #-}
-- | /O(1)/ Convert a character into a Text.  Subject to fusion.
-- Performs replacement on invalid scalar values.
singleton :: Char -> Text
-- 'safe' (from Data.Text.Internal) sanitises the character first;
-- NOTE(review): per the comment above it presumably maps invalid scalar
-- values to a replacement character — confirm in Data.Text.Internal.
singleton = unstream . S.singleton . safe
{-# INLINE [1] singleton #-}
{-# RULES "TEXT singleton" forall a.
unstream (S.singleton (safe a))
= singleton_ a #-}
-- This is intended to reduce inlining bloat.
-- Out-of-line worker targeted by the "TEXT singleton" rewrite rules:
-- builds a 1- or 2-unit UTF-16 array directly.
singleton_ :: Char -> Text
singleton_ c = Text (A.run x) 0 len
where x :: ST s (A.MArray s)
x = do arr <- A.new len
_ <- unsafeWrite arr 0 d
return arr
-- BMP characters (< U+10000) occupy one UTF-16 code unit, everything
-- above needs two; NOTE(review): 'unsafeWrite' presumably emits the
-- surrogate pair itself in the two-unit case — confirm in
-- Data.Text.Internal.Unsafe.Char.
len | d < '\x10000' = 1
| otherwise = 2
d = safe c
{-# NOINLINE singleton_ #-}

View File

@ -1,124 +0,0 @@
{-# LANGUAGE CPP, MagicHash, UnboxedTuples #-}
-- |
-- Module : Data.Text.Unsafe
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Portability : portable
--
-- A module containing unsafe 'Text' operations, for very very careful
-- use in heavily tested code.
module Data.Text.Unsafe
(
inlineInterleaveST
, inlinePerformIO
, unsafeDupablePerformIO
, Iter(..)
, iter
, iter_
, reverseIter
, reverseIter_
, unsafeHead
, unsafeTail
, lengthWord16
, takeWord16
, dropWord16
) where
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.Text.Internal.Encoding.Utf16 (chr2)
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Unsafe (inlineInterleaveST, inlinePerformIO)
import Data.Text.Internal.Unsafe.Char (unsafeChr)
import qualified Data.Text.Array as A
import GHC.IO (unsafeDupablePerformIO)
-- | /O(1)/ A variant of 'head' for non-empty 'Text'. 'unsafeHead'
-- omits the check for the empty case, so there is an obligation on
-- the programmer to provide a proof that the 'Text' is non-empty.
unsafeHead :: Text -> Char
unsafeHead (Text arr off _len)
-- A unit outside 0xD800..0xDBFF is a complete BMP code point; inside
-- that range it is a lead surrogate, and (m,n) is decoded as a pair by
-- 'chr2'.  'n' is lazy, so the out-of-range read at off+1 only happens
-- in the surrogate branch.
| m < 0xD800 || m > 0xDBFF = unsafeChr m
| otherwise = chr2 m n
where m = A.unsafeIndex arr off
n = A.unsafeIndex arr (off+1)
{-# INLINE unsafeHead #-}
-- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeTail'
-- omits the check for the empty case, so there is an obligation on
-- the programmer to provide a proof that the 'Text' is non-empty.
unsafeTail :: Text -> Text
unsafeTail t@(Text arr off len) =
#if defined(ASSERTS)
assert (d <= len) $
#endif
-- 'iter_' gives the width (1 or 2 UTF-16 units) of the first
-- character; the tail is just a re-sliced view, no copying.
Text arr (off+d) (len-d)
where d = iter_ t 0
{-# INLINE unsafeTail #-}
-- A decoded character paired with the number of 'Word16' units it
-- occupied (1 for BMP, 2 for a surrogate pair).
data Iter = Iter {-# UNPACK #-} !Char {-# UNPACK #-} !Int
-- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-16
-- array, returning the current character and the delta to add to give
-- the next offset to iterate at.
iter :: Text -> Int -> Iter
iter (Text arr off _len) i
| m < 0xD800 || m > 0xDBFF = Iter (unsafeChr m) 1
| otherwise = Iter (chr2 m n) 2
-- 'n' (the unit after 'm') is only demanded in the surrogate-pair
-- branch; no bounds checks are performed — "unsafely" is literal.
where m = A.unsafeIndex arr j
n = A.unsafeIndex arr k
j = off + i
k = j + 1
{-# INLINE iter #-}
-- | /O(1)/ Iterate one step through a UTF-16 array, returning the
-- delta to add to give the next offset to iterate at.
-- Width-only variant of 'iter': lead surrogate => 2 units, else 1.
iter_ :: Text -> Int -> Int
iter_ (Text arr off _len) i | m < 0xD800 || m > 0xDBFF = 1
| otherwise = 2
where m = A.unsafeIndex arr (off+i)
{-# INLINE iter_ #-}
-- | /O(1)/ Iterate one step backwards through a UTF-16 array,
-- returning the current character and the delta to add (i.e. a
-- negative number) to give the next offset to iterate at.
reverseIter :: Text -> Int -> (Char,Int)
reverseIter (Text arr off _len) i
-- Walking backwards, 'm' (at off+i) in 0xDC00..0xDFFF is a *trail*
-- surrogate, so the lead surrogate 'n' sits one unit earlier — hence
-- 'chr2 n m' with arguments in lead/trail order, the reverse of 'iter'.
| m < 0xDC00 || m > 0xDFFF = (unsafeChr m, -1)
| otherwise = (chr2 n m, -2)
where m = A.unsafeIndex arr j
n = A.unsafeIndex arr k
j = off + i
k = j - 1
{-# INLINE reverseIter #-}
-- | /O(1)/ Iterate one step backwards through a UTF-16 array,
-- returning the delta to add (i.e. a negative number) to give the
-- next offset to iterate at.
-- Width-only variant of 'reverseIter': trail surrogate => -2, else -1.
--
-- @since 1.1.1.0
reverseIter_ :: Text -> Int -> Int
reverseIter_ (Text arr off _len) i
| m < 0xDC00 || m > 0xDFFF = -1
| otherwise = -2
where m = A.unsafeIndex arr (off+i)
{-# INLINE reverseIter_ #-}
-- | /O(1)/ Return the length of a 'Text' in units of 'Word16'.  This
-- is useful for sizing a target array appropriately before using
-- 'unsafeCopyToPtr'.
lengthWord16 :: Text -> Int
lengthWord16 (Text _arr _off len) = len
{-# INLINE lengthWord16 #-}
-- | /O(1)/ Unchecked take of 'k' 'Word16's from the front of a 'Text'.
-- No bounds check: a 'k' that lands between the two halves of a
-- surrogate pair yields an ill-formed 'Text'.
takeWord16 :: Int -> Text -> Text
takeWord16 k (Text arr off _len) = Text arr off k
{-# INLINE takeWord16 #-}
-- | /O(1)/ Unchecked drop of 'k' 'Word16's from the front of a 'Text'.
-- Same caveat as 'takeWord16': no bounds or surrogate-boundary check.
dropWord16 :: Int -> Text -> Text
dropWord16 k (Text arr off len) = Text arr (off+k) (len-k)
{-# INLINE dropWord16 #-}

View File

@ -1,26 +0,0 @@
Copyright (c) 2008-2009, Tom Harper
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,31 +0,0 @@
# `text`: Fast, packed Unicode strings, using stream fusion
This package provides the Data.Text library, a library for the space-
and time-efficient manipulation of Unicode text in Haskell.
**Please refer to the [package description on Hackage](https://hackage.haskell.org/package/text#description) for more information.**
# Get involved!
Please report bugs via the
[github issue tracker](https://github.com/haskell/text/issues).
Master [git repository](https://github.com/haskell/text):
* `git clone git://github.com/haskell/text.git`
There's also a [Mercurial mirror](https://bitbucket.org/bos/text):
* `hg clone https://bitbucket.org/bos/text`
(You can create and contribute changes using either Mercurial or git.)
# Authors
The base code for this library was originally written by Tom Harper,
based on the stream fusion framework developed by Roman Leshchinskiy,
Duncan Coutts, and Don Stewart.
The core library was fleshed out, debugged, and tested by Bryan
O'Sullivan <bos@serpentine.com>, and he is the current maintainer.

View File

@ -1,3 +0,0 @@
#!/usr/bin/env runhaskell
> import Distribution.Simple
> main = defaultMain

View File

@ -1,4 +0,0 @@
/dist/
/dist-newstyle/
/.ghc.environment.*
/cabal.project.local

View File

@ -1,2 +0,0 @@
import Distribution.Simple
main = defaultMain

View File

@ -1,2 +0,0 @@
-- NB: we use a separate project
packages: .

View File

@ -1,35 +0,0 @@
#include <iconv.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
/* Convert a UTF-8 buffer to UTF-16LE with iconv(3), for benchmarking
 * against Data.Text's decoder.  Returns 0 on success, -1 on failure
 * (iconv_open or allocation error).  The output is written into a
 * buffer that is cached (and grown as needed) across calls; it is
 * never read back, only timed. */
int time_iconv(char *srcbuf, size_t srcbufsize)
{
  uint16_t *destbuf = NULL;
  size_t destbufsize;
  static uint16_t *origdestbuf;
  static size_t origdestbufsize;
  /* BUG FIX: `ic` must be `static` for the caching guard below to work.
   * As a plain local initialised to -1 the guard was always true, so a
   * fresh conversion descriptor was opened — and leaked — on every
   * call. */
  static iconv_t ic = (iconv_t) -1;
  int ret = 0;

  if (ic == (iconv_t) -1) {
    ic = iconv_open("UTF-16LE", "UTF-8");
    if (ic == (iconv_t) -1) {
      ret = -1;
      goto done;
    }
  }

  /* One UTF-16 code unit per UTF-8 byte is an upper bound on the
   * output size, so this buffer can never overflow. */
  destbufsize = srcbufsize * sizeof(uint16_t);
  if (destbufsize > origdestbufsize) {
    free(origdestbuf);
    origdestbuf = destbuf = malloc(origdestbufsize = destbufsize);
    if (destbuf == NULL) {
      /* Keep the cached size consistent with the (now absent) buffer. */
      origdestbufsize = 0;
      ret = -1;
      goto done;
    }
  } else {
    destbuf = origdestbuf;
  }

  iconv(ic, &srcbuf, &srcbufsize, (char**) &destbuf, &destbufsize);

 done:
  return ret;
}

View File

@ -1,73 +0,0 @@
-- | Main module to run the micro benchmarks
--
{-# LANGUAGE OverloadedStrings #-}
module Main
( main
) where
import Criterion.Main (defaultMain, bgroup, env)
import System.FilePath ((</>))
import System.IO (IOMode (WriteMode), openFile, hSetEncoding, utf8)
import qualified Benchmarks.Builder as Builder
import qualified Benchmarks.Concat as Concat
import qualified Benchmarks.DecodeUtf8 as DecodeUtf8
import qualified Benchmarks.EncodeUtf8 as EncodeUtf8
import qualified Benchmarks.Equality as Equality
import qualified Benchmarks.FileRead as FileRead
import qualified Benchmarks.FoldLines as FoldLines
import qualified Benchmarks.Mul as Mul
import qualified Benchmarks.Pure as Pure
import qualified Benchmarks.ReadNumbers as ReadNumbers
import qualified Benchmarks.Replace as Replace
import qualified Benchmarks.Search as Search
import qualified Benchmarks.Stream as Stream
import qualified Benchmarks.WordFrequencies as WordFrequencies
import qualified Benchmarks.Programs.BigTable as Programs.BigTable
import qualified Benchmarks.Programs.Cut as Programs.Cut
import qualified Benchmarks.Programs.Fold as Programs.Fold
import qualified Benchmarks.Programs.Sort as Programs.Sort
import qualified Benchmarks.Programs.StripTags as Programs.StripTags
import qualified Benchmarks.Programs.Throughput as Programs.Throughput
main :: IO ()
main = do
sink <- openFile "/dev/null" WriteMode
hSetEncoding sink utf8
defaultMain
[ Builder.benchmark
, Concat.benchmark
, env (DecodeUtf8.initEnv (tf "libya-chinese.html")) (DecodeUtf8.benchmark "html")
, env (DecodeUtf8.initEnv (tf "yiwiki.xml")) (DecodeUtf8.benchmark "xml")
, env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmark "ascii")
, env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark "russian")
, env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese")
, EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
, env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark
, FileRead.benchmark (tf "russian.txt")
, FoldLines.benchmark (tf "russian.txt")
, env Mul.initEnv Mul.benchmark
, env (Pure.initEnv (tf "tiny.txt")) (Pure.benchmark "tiny")
, env (Pure.initEnv (tf "ascii-small.txt")) (Pure.benchmark "ascii-small")
, env (Pure.initEnv (tf "ascii.txt")) (Pure.benchmark "ascii")
, env (Pure.initEnv (tf "english.txt")) (Pure.benchmark "english")
, env (Pure.initEnv (tf "russian-small.txt")) (Pure.benchmark "russian")
, env (Pure.initEnv (tf "japanese.txt")) (Pure.benchmark "japanese")
, env (ReadNumbers.initEnv (tf "numbers.txt")) ReadNumbers.benchmark
, env (Replace.initEnv (tf "russian.txt")) (Replace.benchmark "принимая" "своем")
, env (Search.initEnv (tf "russian.txt")) (Search.benchmark "принимая")
, env (Stream.initEnv (tf "russian.txt")) Stream.benchmark
, env (WordFrequencies.initEnv (tf "russian.txt")) WordFrequencies.benchmark
, bgroup "Programs"
[ Programs.BigTable.benchmark sink
, Programs.Cut.benchmark (tf "russian.txt") sink 20 40
, Programs.Fold.benchmark (tf "russian.txt") sink
, Programs.Sort.benchmark (tf "russian.txt") sink
, Programs.StripTags.benchmark (tf "yiwiki.xml") sink
, Programs.Throughput.benchmark (tf "russian.txt") sink
]
]
where
-- Location of a test file
tf = ("../tests/text-test-data" </>)

View File

@ -1,75 +0,0 @@
-- | Testing the internal builder monoid
--
-- Tested in this benchmark:
--
-- * Concatenating many small strings using a builder
--
{-# LANGUAGE OverloadedStrings #-}
module Benchmarks.Builder
( benchmark
) where
import Criterion (Benchmark, bgroup, bench, nf)
import Data.Binary.Builder as B
import Data.ByteString.Char8 ()
import Data.Monoid (mconcat, mempty)
import qualified Blaze.ByteString.Builder as Blaze
import qualified Blaze.ByteString.Builder.Char.Utf8 as Blaze
import qualified Data.ByteString as SB
import qualified Data.ByteString.Lazy as LB
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Builder as LTB
import qualified Data.Text.Lazy.Builder.Int as Int
import Data.Int (Int64)
benchmark :: Benchmark
benchmark = bgroup "Builder"
[ bgroup "Comparison"
[ bench "LazyText" $ nf
(LT.length . LTB.toLazyText . mconcat . map LTB.fromText) texts
, bench "Binary" $ nf
(LB.length . B.toLazyByteString . mconcat . map B.fromByteString)
byteStrings
, bench "Blaze" $ nf
(LB.length . Blaze.toLazyByteString . mconcat . map Blaze.fromString)
strings
]
, bgroup "Int"
[ bgroup "Decimal"
[ bgroup "Positive" .
flip map numbers $ \n ->
(bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n)
, bgroup "Negative" .
flip map numbers $ \m ->
let n = negate m in
(bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n)
, bench "Empty" $ nf LTB.toLazyText mempty
, bgroup "Show" .
flip map numbers $ \n ->
(bench (show (length (show n))) $ nf show n)
]
]
]
where
numbers :: [Int64]
numbers = [
6, 14, 500, 9688, 10654, 620735, 5608880, 37010612,
731223504, 5061580596, 24596952933, 711732309084, 2845910093839,
54601756118340, 735159434806159, 3619097625502435, 95777227510267124,
414944309510675693, 8986407456998704019
]
texts :: [T.Text]
texts = take 200000 $ cycle ["foo", "λx", "由の"]
{-# NOINLINE texts #-}
-- Note that the non-ascii characters will be chopped
byteStrings :: [SB.ByteString]
byteStrings = take 200000 $ cycle ["foo", "λx", "由の"]
{-# NOINLINE byteStrings #-}
-- Note that the non-ascii characters will be chopped
strings :: [String]
strings = take 200000 $ cycle ["foo", "λx", "由の"]
{-# NOINLINE strings #-}

View File

@ -1,25 +0,0 @@
{-# LANGUAGE OverloadedStrings #-}
module Benchmarks.Concat (benchmark) where
import Control.Monad.Trans.Writer
import Criterion (Benchmark, bgroup, bench, whnf)
import Data.Text as T
benchmark :: Benchmark
benchmark = bgroup "Concat"
[ bench "append" $ whnf (append4 "Text 1" "Text 2" "Text 3") "Text 4"
, bench "concat" $ whnf (concat4 "Text 1" "Text 2" "Text 3") "Text 4"
, bench "write" $ whnf (write4 "Text 1" "Text 2" "Text 3") "Text 4"
]
-- Three equivalent ways of gluing four 'Text's together, one per
-- benchmark above.  NOTE(review): the NOINLINE pragmas presumably stop
-- GHC from constant-folding the literal benchmark arguments away, so
-- each variant is measured as a real call — confirm against the
-- criterion results.
append4, concat4, write4 :: Text -> Text -> Text -> Text -> Text
{-# NOINLINE append4 #-}
append4 x1 x2 x3 x4 = x1 `append` x2 `append` x3 `append` x4
{-# NOINLINE concat4 #-}
concat4 x1 x2 x3 x4 = T.concat [x1, x2, x3, x4]
{-# NOINLINE write4 #-}
write4 x1 x2 x3 x4 = execWriter $ tell x1 >> tell x2 >> tell x3 >> tell x4

View File

@ -1,74 +0,0 @@
{-# LANGUAGE ForeignFunctionInterface #-}
-- | Test decoding of UTF-8
--
-- Tested in this benchmark:
--
-- * Decoding bytes using UTF-8
--
-- In some tests:
--
-- * Taking the length of the result
--
-- * Taking the init of the result
--
-- The latter are used for testing stream fusion.
--
module Benchmarks.DecodeUtf8
( initEnv
, benchmark
) where
import Foreign.C.Types
import Data.ByteString.Internal (ByteString(..))
import Data.ByteString.Lazy.Internal (ByteString(..))
import Foreign.Ptr (Ptr, plusPtr)
import Foreign.ForeignPtr (withForeignPtr)
import Data.Word (Word8)
import qualified Criterion as C
import Criterion (Benchmark, bgroup, nf, whnfIO)
import qualified Codec.Binary.UTF8.Generic as U8
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
type Env = (B.ByteString, BL.ByteString)
initEnv :: FilePath -> IO Env
initEnv fp = do
bs <- B.readFile fp
lbs <- BL.readFile fp
return (bs, lbs)
benchmark :: String -> Env -> Benchmark
benchmark kind ~(bs, lbs) =
let bench name = C.bench (name ++ "+" ++ kind)
decodeStream (Chunk b0 bs0) = case T.streamDecodeUtf8 b0 of
T.Some t0 _ f0 -> t0 : go f0 bs0
where go f (Chunk b bs1) = case f b of
T.Some t1 _ f1 -> t1 : go f1 bs1
go _ _ = []
decodeStream _ = []
in bgroup "DecodeUtf8"
[ bench "Strict" $ nf T.decodeUtf8 bs
, bench "Stream" $ nf decodeStream lbs
, bench "IConv" $ whnfIO $ iconv bs
, bench "StrictLength" $ nf (T.length . T.decodeUtf8) bs
, bench "StrictInitLength" $ nf (T.length . T.init . T.decodeUtf8) bs
, bench "Lazy" $ nf TL.decodeUtf8 lbs
, bench "LazyLength" $ nf (TL.length . TL.decodeUtf8) lbs
, bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs
, bench "StrictStringUtf8" $ nf U8.toString bs
, bench "StrictStringUtf8Length" $ nf (length . U8.toString) bs
, bench "LazyStringUtf8" $ nf U8.toString lbs
, bench "LazyStringUtf8Length" $ nf (length . U8.toString) lbs
]
iconv :: B.ByteString -> IO CInt
iconv (PS fp off len) = withForeignPtr fp $ \ptr ->
time_iconv (ptr `plusPtr` off) (fromIntegral len)
foreign import ccall unsafe time_iconv :: Ptr Word8 -> CSize -> IO CInt

View File

@ -1,33 +0,0 @@
-- | UTF-8 encode a text
--
-- Tested in this benchmark:
--
-- * Replicating a string a number of times
--
-- * UTF-8 encoding it
--
module Benchmarks.EncodeUtf8
( benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnf)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
-- | Benchmark UTF-8 encoding of the given string, replicated 100000
-- times, for both strict and lazy 'Text'.  Taking the encoded length
-- forces the whole result while keeping the benchmark value whnf-able.
benchmark :: String -> Benchmark
benchmark string =
bgroup "EncodeUtf8"
[ bench "Text" $ whnf (B.length . T.encodeUtf8) text
, bench "LazyText" $ whnf (BL.length . TL.encodeUtf8) lazyText
]
where
-- The string in different formats
text = T.replicate k $ T.pack string
lazyText = TL.replicate (fromIntegral k) $ TL.pack string
-- Amount of replication: large enough that encoding dominates setup.
k = 100000

View File

@ -1,43 +0,0 @@
-- | Compare a string with a copy of itself that is identical except
-- for the last character.
--
-- Tested in this benchmark:
--
-- * Comparison of strings (Eq instance)
--
module Benchmarks.Equality
( initEnv
, benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnf)
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
type Env = (T.Text, TL.Text, B.ByteString, BL.ByteString, BL.ByteString, String)
initEnv :: FilePath -> IO Env
initEnv fp = do
b <- B.readFile fp
bl1 <- BL.readFile fp
-- A lazy bytestring is a list of chunks. When we do not explicitly create two
-- different lazy bytestrings at a different address, the bytestring library
-- will compare the chunk addresses instead of the chunk contents. This is why
-- we read the lazy bytestring twice here.
bl2 <- BL.readFile fp
l <- readFile fp
return (T.decodeUtf8 b, TL.decodeUtf8 bl1, b, bl1, bl2, l)
benchmark :: Env -> Benchmark
benchmark ~(t, tl, b, bl1, bl2, l) =
bgroup "Equality"
[ bench "Text" $ whnf (== T.init t `T.snoc` '\xfffd') t
, bench "LazyText" $ whnf (== TL.init tl `TL.snoc` '\xfffd') tl
, bench "ByteString" $ whnf (== B.init b `B.snoc` '\xfffd') b
, bench "LazyByteString" $ whnf (== BL.init bl2 `BL.snoc` '\xfffd') bl1
, bench "String" $ whnf (== init l ++ "\xfffd") l
]

View File

@ -1,33 +0,0 @@
-- | Benchmarks simple file reading
--
-- Tested in this benchmark:
--
-- * Reading a file from the disk
--
module Benchmarks.FileRead
( benchmark
) where
import Control.Applicative ((<$>))
import Criterion (Benchmark, bgroup, bench, whnfIO)
import qualified Data.ByteString as SB
import qualified Data.ByteString.Lazy as LB
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Encoding as LT
import qualified Data.Text.Lazy.IO as LT
-- | Read the same file via String, strict/lazy ByteString, strict/lazy
-- Text, and ByteString-then-decode.  Each variant takes the length so
-- the entire file contents are forced, making the timings comparable.
benchmark :: FilePath -> Benchmark
benchmark p = bgroup "FileRead"
[ bench "String" $ whnfIO $ length <$> readFile p
, bench "ByteString" $ whnfIO $ SB.length <$> SB.readFile p
, bench "LazyByteString" $ whnfIO $ LB.length <$> LB.readFile p
, bench "Text" $ whnfIO $ T.length <$> T.readFile p
, bench "LazyText" $ whnfIO $ LT.length <$> LT.readFile p
, bench "TextByteString" $ whnfIO $
(T.length . T.decodeUtf8) <$> SB.readFile p
, bench "LazyTextByteString" $ whnfIO $
(LT.length . LT.decodeUtf8) <$> LB.readFile p
]

View File

@ -1,58 +0,0 @@
-- | Read a file line-by-line using handles, and perform a fold over the lines.
-- The fold is used here to calculate the number of lines in the file.
--
-- Tested in this benchmark:
--
-- * Buffered, line-based IO
--
{-# LANGUAGE BangPatterns #-}
module Benchmarks.FoldLines
( benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnfIO)
import System.IO
import qualified Data.ByteString as B
import qualified Data.Text as T
import qualified Data.Text.IO as T
benchmark :: FilePath -> Benchmark
benchmark fp = bgroup "ReadLines"
[ bench "Text" $ withHandle $ foldLinesT (\n _ -> n + 1) (0 :: Int)
, bench "ByteString" $ withHandle $ foldLinesB (\n _ -> n + 1) (0 :: Int)
]
where
withHandle f = whnfIO $ do
h <- openFile fp ReadMode
hSetBuffering h (BlockBuffering (Just 16384))
x <- f h
hClose h
return x
-- | Strict left fold over the lines of a handle, read as 'T.Text'.
-- The accumulator is forced on every iteration, so the fold runs in
-- constant space for a strict combining function.
foldLinesT :: (a -> T.Text -> a) -> a -> Handle -> IO a
foldLinesT f z0 h = loop z0
  where
    loop !acc = do
      done <- hIsEOF h
      if done
        then return acc
        else do
          line <- T.hGetLine h
          loop (f acc line)
{-# INLINE foldLinesT #-}
-- | ByteString line fold
--
-- Mirrors 'foldLinesT' but reads strict 'B.ByteString' lines.  The
-- bang on 'z' forces the accumulator each iteration, keeping the fold
-- in constant space for a strict combining function.
foldLinesB :: (a -> B.ByteString -> a) -> a -> Handle -> IO a
foldLinesB f z0 h = go z0
where
go !z = do
eof <- hIsEOF h
if eof
then return z
else do
l <- B.hGetLine h
let z' = f z l in go z'
{-# INLINE foldLinesB #-}

View File

@ -1,146 +0,0 @@
module Benchmarks.Mul
( initEnv
, benchmark
) where
import Control.Exception (evaluate)
import Criterion.Main
import Data.Int (Int32, Int64)
import Data.Text.Internal (mul32, mul64)
import qualified Data.Vector.Unboxed as U
-- | The pre-'mul64' overflow-checked multiply, kept as a baseline for
-- the benchmark: zero short-circuits, otherwise the product is allowed
-- only when it cannot exceed 'maxBound'.
oldMul :: Int64 -> Int64 -> Int64
oldMul m n
  | n == 0                = 0
  | m > maxBound `quot` n = error "overflow"
  | otherwise             = m * n
type Env = (U.Vector (Int32,Int32), U.Vector (Int64,Int64))
initEnv :: IO Env
initEnv = do
x <- evaluate testVector32
y <- evaluate testVector64
return (x, y)
benchmark :: Env -> Benchmark
benchmark ~(tv32, tv64) = bgroup "Mul"
[ bench "oldMul" $ whnf (U.map (uncurry oldMul)) tv64
, bench "mul64" $ whnf (U.map (uncurry mul64)) tv64
, bench "*64" $ whnf (U.map (uncurry (*))) tv64
, bench "mul32" $ whnf (U.map (uncurry mul32)) tv32
, bench "*32" $ whnf (U.map (uncurry (*))) tv32
]
testVector64 :: U.Vector (Int64,Int64)
testVector64 = U.fromList [
(0,1248868987182846646),(169004623633872,24458),(482549039517835,7614),
(372,8157063115504364),(27,107095594861148252),(3,63249878517962420),
(4363,255694473572912),(86678474,1732634806),(1572453024,1800489338),
(9384523143,77053781),(49024709555,75095046),(7,43457620410239131),
(8,8201563008844571),(387719037,1520696708),(189869238220197,1423),
(46788016849611,23063),(503077742109974359,0),(104,1502010908706487),
(30478140346,207525518),(80961140129236192,14),(4283,368012829143675),
(1028719181728108146,6),(318904,5874863049591),(56724427166898,110794),
(234539368,31369110449),(2,251729663598178612),(103291548194451219,5),
(76013,5345328755566),(1769631,2980846129318),(40898,60598477385754),
(0,98931348893227155),(573555872156917492,3),(318821187115,4476566),
(11152874213584,243582),(40274276,16636653248),(127,4249988676030597),
(103543712111871836,5),(71,16954462148248238),(3963027173504,216570),
(13000,503523808916753),(17038308,20018685905),(0,510350226577891549),
(175898,3875698895405),(425299191292676,5651),(17223451323664536,50),
(61755131,14247665326),(0,1018195131697569303),(36433751497238985,20),
(3473607861601050,1837),(1392342328,1733971838),(225770297367,3249655),
(14,127545244155254102),(1751488975299136,2634),(3949208,504190668767),
(153329,831454434345),(1066212122928663658,2),(351224,2663633539556),
(344565,53388869217),(35825609350446863,54),(276011553660081475,10),
(1969754174790470349,3),(35,68088438338633),(506710,3247689556438),
(11099382291,327739909),(105787303549,32824363),(210366111,14759049409),
(688893241579,3102676),(8490,70047474429581),(152085,29923000251880),
(5046974599257095,400),(4183167795,263434071),(10089728,502781960687),
(44831977765,4725378),(91,8978094664238578),(30990165721,44053350),
(1772377,149651820860),(243420621763408572,4),(32,5790357453815138),
(27980806337993771,5),(47696295759774,20848),(1745874142313778,1098),
(46869334770121,1203),(886995283,1564424789),(40679396544,76002479),
(1,672849481568486995),(337656187205,3157069),(816980552858963,6003),
(2271434085804831543,1),(0,1934521023868747186),(6266220038281,15825),
(4160,107115946987394),(524,246808621791561),(0,1952519482439636339),
(128,2865935904539691),(1044,3211982069426297),(16000511542473,88922),
(1253596745404082,2226),(27041,56836278958002),(23201,49247489754471),
(175906590497,21252392),(185163584757182295,24),(34742225226802197,150),
(2363228,250824838408),(216327527109550,45),(24,81574076994520675),
(28559899906542,15356),(10890139774837133,511),(2293,707179303654492),
(2749366833,40703233),(0,4498229704622845986),(439,4962056468281937),
(662,1453820621089921),(16336770612459631,220),(24282989393,74239137),
(2724564648490195,3),(743672760,124992589),(4528103,704330948891),
(6050483122491561,250),(13322953,13594265152),(181794,22268101450214),
(25957941712,75384092),(43352,7322262295009),(32838,52609059549923),
(33003585202001564,2),(103019,68430142267402),(129918230800,8742978),
(0,2114347379589080688),(2548,905723041545274),(222745067962838382,0),
(1671683850790425181,1),(455,4836932776795684),(794227702827214,6620),
(212534135175874,1365),(96432431858,29784975),(466626763743380,3484),
(29793949,53041519613),(8359,309952753409844),(3908960585331901,26),
(45185288970365760,114),(10131829775,68110174),(58039242399640479,83),
(628092278238719399,6),(1,196469106875361889),(302336625,16347502444),
(148,3748088684181047),(1,1649096568849015456),(1019866864,2349753026),
(8211344830,569363306),(65647579546873,34753),(2340190,1692053129069),
(64263301,30758930355),(48681618072372209,110),(7074794736,47640197),
(249634721521,7991792),(1162917363807215,232),(7446433349,420634045),
(63398619383,60709817),(51359004508011,14200),(131788797028647,7072),
(52079887791430043,7),(7,136277667582599838),(28582879735696,50327),
(1404582800566278,833),(469164435,15017166943),(99567079957578263,49),
(1015285971,3625801566),(321504843,4104079293),(5196954,464515406632),
(114246832260876,7468),(8149664437,487119673),(12265299,378168974869),
(37711995764,30766513),(3971137243,710996152),(483120070302,603162),
(103009942,61645547145),(8476344625340,6987),(547948761229739,1446),
(42234,18624767306301),(13486714173011,58948),(4,198309153268019840),
(9913176974,325539248),(28246225540203,116822),(2882463945582154,18),
(959,25504987505398),(3,1504372236378217710),(13505229956793,374987),
(751661959,457611342),(27375926,36219151769),(482168869,5301952074),
(1,1577425863241520640),(714116235611821,1164),(904492524250310488,0),
(5983514941763398,68),(10759472423,23540686),(72539568471529,34919),
(4,176090672310337473),(938702842110356453,1),(673652445,3335287382),
(3111998893666122,917),(1568013,3168419765469)]
testVector32 :: U.Vector (Int32,Int32)
testVector32 = U.fromList [
(39242,410),(0,100077553),(2206,9538),(509400240,1),(38048,6368),
(1789,651480),(2399,157032),(701,170017),(5241456,14),(11212,70449),
(1,227804876),(749687254,1),(74559,2954),(1158,147957),(410604456,1),
(170851,1561),(92643422,1),(6192,180509),(7,24202210),(3440,241481),
(5753677,5),(294327,1622),(252,4454673),(127684121,11),(28315800,30),
(340370905,0),(1,667887987),(592782090,1),(49023,27641),(750,290387),
(72886,3847),(0,301047933),(3050276,473),(1,788366142),(59457,15813),
(637726933,1),(1135,344317),(853616,264),(696816,493),(7038,12046),
(125219574,4),(803694088,1),(107081726,1),(39294,21699),(16361,38191),
(132561123,12),(1760,23499),(847543,484),(175687349,1),(2963,252678),
(6248,224553),(27596,4606),(5422922,121),(1542,485890),(131,583035),
(59096,4925),(3637115,132),(0,947225435),(86854,6794),(2984745,339),
(760129569,1),(1,68260595),(380835652,2),(430575,2579),(54514,7211),
(15550606,3),(9,27367402),(3007053,207),(7060988,60),(28560,27130),
(1355,21087),(10880,53059),(14563646,4),(461886361,1),(2,169260724),
(241454126,2),(406797,1),(61631630,16),(44473,5943),(63869104,12),
(950300,1528),(2113,62333),(120817,9358),(100261456,1),(426764723,1),
(119,12723684),(3,53358711),(4448071,18),(1,230278091),(238,232102),
(8,57316440),(42437979,10),(6769,19555),(48590,22006),(11500585,79),
(2808,97638),(42,26952545),(11,32104194),(23954638,1),(785427272,0),
(513,81379),(31333960,37),(897772,1009),(4,25679692),(103027993,12),
(104972702,11),(546,443401),(7,65137092),(88574269,3),(872139069,0),
(2,97417121),(378802603,0),(141071401,4),(22613,10575),(2191743,118),
(470,116119),(7062,38166),(231056,1847),(43901963,9),(2400,70640),
(63553,1555),(34,11249573),(815174,1820),(997894011,0),(98881794,2),
(5448,43132),(27956,9),(904926,1357),(112608626,3),(124,613021),
(282086,1966),(99,10656881),(113799,1501),(433318,2085),(442,948171),
(165380,1043),(28,14372905),(14880,50462),(2386,219918),(229,1797565),
(1174961,298),(3925,41833),(3903515,299),(15690452,111),(360860521,3),
(7440846,81),(2541026,507),(0,492448477),(6869,82469),(245,8322939),
(3503496,253),(123495298,0),(150963,2299),(33,4408482),(1,200911107),
(305,252121),(13,123369189),(215846,8181),(2440,65387),(776764401,1),
(1241172,434),(8,15493155),(81953961,6),(17884993,5),(26,6893822),
(0,502035190),(1,582451018),(2,514870139),(227,3625619),(49,12720258),
(1456769,207),(94797661,10),(234407,893),(26843,5783),(15688,24547),
(4091,86268),(4339448,151),(21360,6294),(397046497,2),(1227,205936),
(9966,21959),(160046791,1),(0,159992224),(27,24974797),(19177,29334),
(4136148,42),(21179785,53),(61256583,31),(385,344176),(7,11934915),
(1,18992566),(3488065,5),(768021,224),(36288474,7),(8624,117561),
(8,20341439),(5903,261475),(561,1007618),(1738,392327),(633049,1708)]

View File

@ -1,42 +0,0 @@
-- | Create a large HTML table and dump it to a handle
--
-- Tested in this benchmark:
--
-- * Creating a large HTML document using a builder
--
-- * Writing to a handle
--
{-# LANGUAGE OverloadedStrings #-}
module Benchmarks.Programs.BigTable
( benchmark
) where
import Criterion (Benchmark, bench, whnfIO)
import Data.Monoid (mappend, mconcat)
import Data.Text.Lazy.Builder (Builder, fromText, toLazyText)
import Data.Text.Lazy.IO (hPutStr)
import System.IO (Handle)
import qualified Data.Text as T
benchmark :: Handle -> Benchmark
benchmark sink = bench "BigTable" $ whnfIO $ do
hPutStr sink "Content-Type: text/html\n\n<table>"
hPutStr sink . toLazyText . makeTable =<< rows
hPutStr sink "</table>"
where
-- We provide the number of rows in IO so the builder value isn't shared
-- between the benchmark samples.
rows :: IO Int
rows = return 20000
{-# NOINLINE rows #-}
makeTable :: Int -> Builder
makeTable n = mconcat $ replicate n $ mconcat $ map makeCol [1 .. 50]
makeCol :: Int -> Builder
makeCol 1 = fromText "<tr><td>1</td>"
makeCol 50 = fromText "<td>50</td></tr>"
makeCol i = fromText "<td>" `mappend` (fromInt i `mappend` fromText "</td>")
fromInt :: Int -> Builder
fromInt = fromText . T.pack . show

View File

@ -1,98 +0,0 @@
-- | Cut into a file, selecting certain columns (e.g. columns 10 to 40)
--
-- Tested in this benchmark:
--
-- * Reading the file
--
-- * Splitting into lines
--
-- * Taking a number of characters from the lines
--
-- * Joining the lines
--
-- * Writing back to a handle
--
module Benchmarks.Programs.Cut
( benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnfIO)
import System.IO (Handle, hPutStr)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Char8 as BLC
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.Text.Lazy.IO as TL
benchmark :: FilePath -> Handle -> Int -> Int -> Benchmark
benchmark p sink from to = bgroup "Cut"
[ bench' "String" string
, bench' "ByteString" byteString
, bench' "LazyByteString" lazyByteString
, bench' "Text" text
, bench' "LazyText" lazyText
, bench' "TextByteString" textByteString
, bench' "LazyTextByteString" lazyTextByteString
]
where
bench' n s = bench n $ whnfIO (s p sink from to)
string :: FilePath -> Handle -> Int -> Int -> IO ()
string fp sink from to = do
s <- readFile fp
hPutStr sink $ cut s
where
cut = unlines . map (take (to - from) . drop from) . lines
byteString :: FilePath -> Handle -> Int -> Int -> IO ()
byteString fp sink from to = do
bs <- B.readFile fp
B.hPutStr sink $ cut bs
where
cut = BC.unlines . map (B.take (to - from) . B.drop from) . BC.lines
lazyByteString :: FilePath -> Handle -> Int -> Int -> IO ()
lazyByteString fp sink from to = do
bs <- BL.readFile fp
BL.hPutStr sink $ cut bs
where
cut = BLC.unlines . map (BL.take (to' - from') . BL.drop from') . BLC.lines
from' = fromIntegral from
to' = fromIntegral to
text :: FilePath -> Handle -> Int -> Int -> IO ()
text fp sink from to = do
t <- T.readFile fp
T.hPutStr sink $ cut t
where
cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
lazyText :: FilePath -> Handle -> Int -> Int -> IO ()
lazyText fp sink from to = do
t <- TL.readFile fp
TL.hPutStr sink $ cut t
where
cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
from' = fromIntegral from
to' = fromIntegral to
textByteString :: FilePath -> Handle -> Int -> Int -> IO ()
textByteString fp sink from to = do
t <- T.decodeUtf8 `fmap` B.readFile fp
B.hPutStr sink $ T.encodeUtf8 $ cut t
where
cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
lazyTextByteString :: FilePath -> Handle -> Int -> Int -> IO ()
lazyTextByteString fp sink from to = do
t <- TL.decodeUtf8 `fmap` BL.readFile fp
BL.hPutStr sink $ TL.encodeUtf8 $ cut t
where
cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
from' = fromIntegral from
to' = fromIntegral to

View File

@ -1,68 +0,0 @@
-- | Benchmark which formats paragraph, like the @sort@ unix utility.
--
-- Tested in this benchmark:
--
-- * Reading the file
--
-- * Splitting into paragraphs
--
-- * Reformatting the paragraphs to a certain line width
--
-- * Concatenating the results using the text builder
--
-- * Writing back to a handle
--
{-# LANGUAGE OverloadedStrings #-}
module Benchmarks.Programs.Fold
( benchmark
) where
import Data.List (foldl')
import Data.List (intersperse)
import Data.Monoid (mempty, mappend, mconcat)
import System.IO (Handle)
import Criterion (Benchmark, bench, whnfIO)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy.Builder as TLB
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TL
benchmark :: FilePath -> Handle -> Benchmark
benchmark i o =
bench "Fold" $ whnfIO $ T.readFile i >>= TL.hPutStr o . fold 80
-- | We represent a paragraph by a word list
--
type Paragraph = [T.Text]
-- | Fold a text
--
fold :: Int -> T.Text -> TL.Text
fold maxWidth = TLB.toLazyText . mconcat .
intersperse "\n\n" . map (foldParagraph maxWidth) . paragraphs
-- | Fold a paragraph
--
foldParagraph :: Int -> Paragraph -> TLB.Builder
foldParagraph _ [] = mempty
foldParagraph max' (w : ws) = fst $ foldl' go (TLB.fromText w, T.length w) ws
where
go (builder, width) word
| width + len + 1 <= max' =
(builder `mappend` " " `mappend` word', width + len + 1)
| otherwise =
(builder `mappend` "\n" `mappend` word', len)
where
word' = TLB.fromText word
len = T.length word
-- | Divide a text into paragraphs
--
paragraphs :: T.Text -> [Paragraph]
paragraphs = splitParagraphs . map T.words . T.lines
where
splitParagraphs ls = case break null ls of
([], []) -> []
(p, []) -> [concat p]
(p, lr) -> concat p : splitParagraphs (dropWhile null lr)

View File

@ -1,71 +0,0 @@
-- | This benchmark sorts the lines of a file, like the @sort@ unix utility.
--
-- Tested in this benchmark:
--
-- * Reading the file
--
-- * Splitting into lines
--
-- * Sorting the lines
--
-- * Joining the lines
--
-- * Writing back to a handle
--
{-# LANGUAGE OverloadedStrings #-}
module Benchmarks.Programs.Sort
( benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnfIO)
import Data.Monoid (mconcat)
import System.IO (Handle, hPutStr)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Char8 as BLC
import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as TLB
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.Text.Lazy.IO as TL
benchmark :: FilePath -> Handle -> Benchmark
benchmark i o = bgroup "Sort"
[ bench "String" $ whnfIO $ readFile i >>= hPutStr o . string
, bench "ByteString" $ whnfIO $ B.readFile i >>= B.hPutStr o . byteString
, bench "LazyByteString" $ whnfIO $
BL.readFile i >>= BL.hPutStr o . lazyByteString
, bench "Text" $ whnfIO $ T.readFile i >>= T.hPutStr o . text
, bench "LazyText" $ whnfIO $ TL.readFile i >>= TL.hPutStr o . lazyText
, bench "TextByteString" $ whnfIO $ B.readFile i >>=
B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8
, bench "LazyTextByteString" $ whnfIO $ BL.readFile i >>=
BL.hPutStr o . TL.encodeUtf8 . lazyText . TL.decodeUtf8
, bench "TextBuilder" $ whnfIO $ B.readFile i >>=
BL.hPutStr o . TL.encodeUtf8 . textBuilder . T.decodeUtf8
]
string :: String -> String
string = unlines . L.sort . lines
byteString :: B.ByteString -> B.ByteString
byteString = BC.unlines . L.sort . BC.lines
lazyByteString :: BL.ByteString -> BL.ByteString
lazyByteString = BLC.unlines . L.sort . BLC.lines
text :: T.Text -> T.Text
text = T.unlines . L.sort . T.lines
lazyText :: TL.Text -> TL.Text
lazyText = TL.unlines . L.sort . TL.lines
-- | Text variant using a builder monoid for the final concatenation
--
textBuilder :: T.Text -> TL.Text
textBuilder = TLB.toLazyText . mconcat . L.intersperse (TLB.singleton '\n') .
map TLB.fromText . L.sort . T.lines

View File

@ -1,53 +0,0 @@
-- | Program to replace HTML tags by whitespace
--
-- This program was originally contributed by Petr Prokhorenkov.
--
-- Tested in this benchmark:
--
-- * Reading the file
--
-- * Replacing text between HTML tags (<>) with whitespace
--
-- * Writing back to a handle
--
{-# OPTIONS_GHC -fspec-constr-count=5 #-}
module Benchmarks.Programs.StripTags
( benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnfIO)
import Data.List (mapAccumL)
import System.IO (Handle, hPutStr)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
benchmark :: FilePath -> Handle -> Benchmark
benchmark i o = bgroup "StripTags"
[ bench "String" $ whnfIO $ readFile i >>= hPutStr o . string
, bench "ByteString" $ whnfIO $ B.readFile i >>= B.hPutStr o . byteString
, bench "Text" $ whnfIO $ T.readFile i >>= T.hPutStr o . text
, bench "TextByteString" $ whnfIO $
B.readFile i >>= B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8
]
string :: String -> String
string = snd . mapAccumL step 0
text :: T.Text -> T.Text
text = snd . T.mapAccumL step 0
byteString :: B.ByteString -> B.ByteString
byteString = snd . BC.mapAccumL step 0
step :: Int -> Char -> (Int, Char)
step d c
| d > 0 || d' > 0 = (d', ' ')
| otherwise = (d', c)
where
d' = d + depth c
depth '>' = 1
depth '<' = -1
depth _ = 0

View File

@ -1,41 +0,0 @@
-- | This benchmark simply reads and writes a file using the various string
-- libraries. The point of it is that we can make better estimations on how
-- much time the other benchmarks spend doing IO.
--
-- Note that we expect ByteStrings to be a whole lot faster, since they do not
-- do any actual encoding/decoding here, while String and Text do have UTF-8
-- encoding/decoding.
--
-- Tested in this benchmark:
--
-- * Reading the file
--
-- * Replacing text between HTML tags (<>) with whitespace
--
-- * Writing back to a handle
--
module Benchmarks.Programs.Throughput
( benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnfIO)
import System.IO (Handle, hPutStr)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.Text.Lazy.IO as TL
benchmark :: FilePath -> Handle -> Benchmark
benchmark fp sink = bgroup "Throughput"
[ bench "String" $ whnfIO $ readFile fp >>= hPutStr sink
, bench "ByteString" $ whnfIO $ B.readFile fp >>= B.hPutStr sink
, bench "LazyByteString" $ whnfIO $ BL.readFile fp >>= BL.hPutStr sink
, bench "Text" $ whnfIO $ T.readFile fp >>= T.hPutStr sink
, bench "LazyText" $ whnfIO $ TL.readFile fp >>= TL.hPutStr sink
, bench "TextByteString" $ whnfIO $
B.readFile fp >>= B.hPutStr sink . T.encodeUtf8 . T.decodeUtf8
, bench "LazyTextByteString" $ whnfIO $
BL.readFile fp >>= BL.hPutStr sink . TL.encodeUtf8 . TL.decodeUtf8
]

View File

@ -1,518 +0,0 @@
-- | Benchmarks various pure functions from the Text library
--
-- Tested in this benchmark:
--
-- * Most pure functions defined the string types
--
{-# LANGUAGE BangPatterns, CPP, GADTs, MagicHash #-}
{-# LANGUAGE DeriveAnyClass, DeriveGeneric, RecordWildCards #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Benchmarks.Pure
( initEnv
, benchmark
) where
import Control.DeepSeq (NFData (..))
import Control.Exception (evaluate)
import Criterion (Benchmark, bgroup, bench, nf)
import Data.Char (toLower, toUpper)
import Data.Monoid (mappend, mempty)
import GHC.Base (Char (..), Int (..), chr#, ord#, (+#))
import GHC.Generics (Generic)
import GHC.Int (Int64)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as TB
import qualified Data.Text.Lazy.Encoding as TL
data Env = Env
{ bsa :: !BS.ByteString
, ta :: !T.Text
, tb :: !T.Text
, tla :: !TL.Text
, tlb :: !TL.Text
, bsb :: !BS.ByteString
, bla :: !BL.ByteString
, blb :: !BL.ByteString
, sa :: !String
, sb :: !String
, bsa_len :: !Int
, ta_len :: !Int
, bla_len :: !Int64
, tla_len :: !Int64
, sa_len :: !Int
, bsl :: [BS.ByteString]
, bll :: [BL.ByteString]
, tl :: [T.Text]
, tll :: [TL.Text]
, sl :: [String]
} deriving (Generic, NFData)
initEnv :: FilePath -> IO Env
initEnv fp = do
-- Evaluate stuff before actually running the benchmark, we don't want to
-- count it here.
-- ByteString A
bsa <- BS.readFile fp
-- Text A/B, LazyText A/B
ta <- evaluate $ T.decodeUtf8 bsa
tb <- evaluate $ T.toUpper ta
tla <- evaluate $ TL.fromChunks (T.chunksOf 16376 ta)
tlb <- evaluate $ TL.fromChunks (T.chunksOf 16376 tb)
-- ByteString B, LazyByteString A/B
bsb <- evaluate $ T.encodeUtf8 tb
bla <- evaluate $ BL.fromChunks (chunksOf 16376 bsa)
blb <- evaluate $ BL.fromChunks (chunksOf 16376 bsb)
-- String A/B
sa <- evaluate $ UTF8.toString bsa
sb <- evaluate $ T.unpack tb
-- Lengths
bsa_len <- evaluate $ BS.length bsa
ta_len <- evaluate $ T.length ta
bla_len <- evaluate $ BL.length bla
tla_len <- evaluate $ TL.length tla
sa_len <- evaluate $ L.length sa
-- Lines
bsl <- evaluate $ BS.lines bsa
bll <- evaluate $ BL.lines bla
tl <- evaluate $ T.lines ta
tll <- evaluate $ TL.lines tla
sl <- evaluate $ L.lines sa
return Env{..}
benchmark :: String -> Env -> Benchmark
benchmark kind ~Env{..} =
bgroup "Pure"
[ bgroup "append"
[ benchT $ nf (T.append tb) ta
, benchTL $ nf (TL.append tlb) tla
, benchBS $ nf (BS.append bsb) bsa
, benchBSL $ nf (BL.append blb) bla
, benchS $ nf ((++) sb) sa
]
, bgroup "concat"
[ benchT $ nf T.concat tl
, benchTL $ nf TL.concat tll
, benchBS $ nf BS.concat bsl
, benchBSL $ nf BL.concat bll
, benchS $ nf L.concat sl
]
, bgroup "cons"
[ benchT $ nf (T.cons c) ta
, benchTL $ nf (TL.cons c) tla
, benchBS $ nf (BS.cons c) bsa
, benchBSL $ nf (BL.cons c) bla
, benchS $ nf (c:) sa
]
, bgroup "concatMap"
[ benchT $ nf (T.concatMap (T.replicate 3 . T.singleton)) ta
, benchTL $ nf (TL.concatMap (TL.replicate 3 . TL.singleton)) tla
, benchBS $ nf (BS.concatMap (BS.replicate 3)) bsa
, benchBSL $ nf (BL.concatMap (BL.replicate 3)) bla
, benchS $ nf (L.concatMap (L.replicate 3 . (:[]))) sa
]
, bgroup "decode"
[ benchT $ nf T.decodeUtf8 bsa
, benchTL $ nf TL.decodeUtf8 bla
, benchBS $ nf BS.unpack bsa
, benchBSL $ nf BL.unpack bla
, benchS $ nf UTF8.toString bsa
]
, bgroup "decode'"
[ benchT $ nf T.decodeUtf8' bsa
, benchTL $ nf TL.decodeUtf8' bla
]
, bgroup "drop"
[ benchT $ nf (T.drop (ta_len `div` 3)) ta
, benchTL $ nf (TL.drop (tla_len `div` 3)) tla
, benchBS $ nf (BS.drop (bsa_len `div` 3)) bsa
, benchBSL $ nf (BL.drop (bla_len `div` 3)) bla
, benchS $ nf (L.drop (sa_len `div` 3)) sa
]
, bgroup "encode"
[ benchT $ nf T.encodeUtf8 ta
, benchTL $ nf TL.encodeUtf8 tla
, benchBS $ nf BS.pack sa
, benchBSL $ nf BL.pack sa
, benchS $ nf UTF8.fromString sa
]
, bgroup "filter"
[ benchT $ nf (T.filter p0) ta
, benchTL $ nf (TL.filter p0) tla
, benchBS $ nf (BS.filter p0) bsa
, benchBSL $ nf (BL.filter p0) bla
, benchS $ nf (L.filter p0) sa
]
, bgroup "filter.filter"
[ benchT $ nf (T.filter p1 . T.filter p0) ta
, benchTL $ nf (TL.filter p1 . TL.filter p0) tla
, benchBS $ nf (BS.filter p1 . BS.filter p0) bsa
, benchBSL $ nf (BL.filter p1 . BL.filter p0) bla
, benchS $ nf (L.filter p1 . L.filter p0) sa
]
, bgroup "foldl'"
[ benchT $ nf (T.foldl' len 0) ta
, benchTL $ nf (TL.foldl' len 0) tla
, benchBS $ nf (BS.foldl' len 0) bsa
, benchBSL $ nf (BL.foldl' len 0) bla
, benchS $ nf (L.foldl' len 0) sa
]
, bgroup "foldr"
[ benchT $ nf (L.length . T.foldr (:) []) ta
, benchTL $ nf (L.length . TL.foldr (:) []) tla
, benchBS $ nf (L.length . BS.foldr (:) []) bsa
, benchBSL $ nf (L.length . BL.foldr (:) []) bla
, benchS $ nf (L.length . L.foldr (:) []) sa
]
, bgroup "head"
[ benchT $ nf T.head ta
, benchTL $ nf TL.head tla
, benchBS $ nf BS.head bsa
, benchBSL $ nf BL.head bla
, benchS $ nf L.head sa
]
, bgroup "init"
[ benchT $ nf T.init ta
, benchTL $ nf TL.init tla
, benchBS $ nf BS.init bsa
, benchBSL $ nf BL.init bla
, benchS $ nf L.init sa
]
, bgroup "intercalate"
[ benchT $ nf (T.intercalate tsw) tl
, benchTL $ nf (TL.intercalate tlw) tll
, benchBS $ nf (BS.intercalate bsw) bsl
, benchBSL $ nf (BL.intercalate blw) bll
, benchS $ nf (L.intercalate lw) sl
]
, bgroup "intersperse"
[ benchT $ nf (T.intersperse c) ta
, benchTL $ nf (TL.intersperse c) tla
, benchBS $ nf (BS.intersperse c) bsa
, benchBSL $ nf (BL.intersperse c) bla
, benchS $ nf (L.intersperse c) sa
]
, bgroup "isInfixOf"
[ benchT $ nf (T.isInfixOf tsw) ta
, benchTL $ nf (TL.isInfixOf tlw) tla
, benchBS $ nf (BS.isInfixOf bsw) bsa
-- no isInfixOf for lazy bytestrings
, benchS $ nf (L.isInfixOf lw) sa
]
, bgroup "last"
[ benchT $ nf T.last ta
, benchTL $ nf TL.last tla
, benchBS $ nf BS.last bsa
, benchBSL $ nf BL.last bla
, benchS $ nf L.last sa
]
, bgroup "map"
[ benchT $ nf (T.map f) ta
, benchTL $ nf (TL.map f) tla
, benchBS $ nf (BS.map f) bsa
, benchBSL $ nf (BL.map f) bla
, benchS $ nf (L.map f) sa
]
, bgroup "mapAccumL"
[ benchT $ nf (T.mapAccumL g 0) ta
, benchTL $ nf (TL.mapAccumL g 0) tla
, benchBS $ nf (BS.mapAccumL g 0) bsa
, benchBSL $ nf (BL.mapAccumL g 0) bla
, benchS $ nf (L.mapAccumL g 0) sa
]
, bgroup "mapAccumR"
[ benchT $ nf (T.mapAccumR g 0) ta
, benchTL $ nf (TL.mapAccumR g 0) tla
, benchBS $ nf (BS.mapAccumR g 0) bsa
, benchBSL $ nf (BL.mapAccumR g 0) bla
, benchS $ nf (L.mapAccumR g 0) sa
]
, bgroup "map.map"
[ benchT $ nf (T.map f . T.map f) ta
, benchTL $ nf (TL.map f . TL.map f) tla
, benchBS $ nf (BS.map f . BS.map f) bsa
, benchBSL $ nf (BL.map f . BL.map f) bla
, benchS $ nf (L.map f . L.map f) sa
]
, bgroup "replicate char"
[ benchT $ nf (T.replicate bsa_len) (T.singleton c)
, benchTL $ nf (TL.replicate (fromIntegral bsa_len)) (TL.singleton c)
, benchBS $ nf (BS.replicate bsa_len) c
, benchBSL $ nf (BL.replicate (fromIntegral bsa_len)) c
, benchS $ nf (L.replicate bsa_len) c
]
, bgroup "replicate string"
[ benchT $ nf (T.replicate (bsa_len `div` T.length tsw)) tsw
, benchTL $ nf (TL.replicate (fromIntegral bsa_len `div` TL.length tlw)) tlw
, benchS $ nf (replicat (bsa_len `div` T.length tsw)) lw
]
, bgroup "reverse"
[ benchT $ nf T.reverse ta
, benchTL $ nf TL.reverse tla
, benchBS $ nf BS.reverse bsa
, benchBSL $ nf BL.reverse bla
, benchS $ nf L.reverse sa
]
, bgroup "take"
[ benchT $ nf (T.take (ta_len `div` 3)) ta
, benchTL $ nf (TL.take (tla_len `div` 3)) tla
, benchBS $ nf (BS.take (bsa_len `div` 3)) bsa
, benchBSL $ nf (BL.take (bla_len `div` 3)) bla
, benchS $ nf (L.take (sa_len `div` 3)) sa
]
, bgroup "tail"
[ benchT $ nf T.tail ta
, benchTL $ nf TL.tail tla
, benchBS $ nf BS.tail bsa
, benchBSL $ nf BL.tail bla
, benchS $ nf L.tail sa
]
, bgroup "toLower"
[ benchT $ nf T.toLower ta
, benchTL $ nf TL.toLower tla
, benchBS $ nf (BS.map toLower) bsa
, benchBSL $ nf (BL.map toLower) bla
, benchS $ nf (L.map toLower) sa
]
, bgroup "toUpper"
[ benchT $ nf T.toUpper ta
, benchTL $ nf TL.toUpper tla
, benchBS $ nf (BS.map toUpper) bsa
, benchBSL $ nf (BL.map toUpper) bla
, benchS $ nf (L.map toUpper) sa
]
, bgroup "uncons"
[ benchT $ nf T.uncons ta
, benchTL $ nf TL.uncons tla
, benchBS $ nf BS.uncons bsa
, benchBSL $ nf BL.uncons bla
, benchS $ nf L.uncons sa
]
, bgroup "words"
[ benchT $ nf T.words ta
, benchTL $ nf TL.words tla
, benchBS $ nf BS.words bsa
, benchBSL $ nf BL.words bla
, benchS $ nf L.words sa
]
, bgroup "zipWith"
[ benchT $ nf (T.zipWith min tb) ta
, benchTL $ nf (TL.zipWith min tlb) tla
, benchBS $ nf (BS.zipWith min bsb) bsa
, benchBSL $ nf (BL.zipWith min blb) bla
, benchS $ nf (L.zipWith min sb) sa
]
, bgroup "length"
[ bgroup "cons"
[ benchT $ nf (T.length . T.cons c) ta
, benchTL $ nf (TL.length . TL.cons c) tla
, benchBS $ nf (BS.length . BS.cons c) bsa
, benchBSL $ nf (BL.length . BL.cons c) bla
, benchS $ nf (L.length . (:) c) sa
]
, bgroup "decode"
[ benchT $ nf (T.length . T.decodeUtf8) bsa
, benchTL $ nf (TL.length . TL.decodeUtf8) bla
, benchBS $ nf (L.length . BS.unpack) bsa
, benchBSL $ nf (L.length . BL.unpack) bla
, bench "StringUTF8" $ nf (L.length . UTF8.toString) bsa
]
, bgroup "drop"
[ benchT $ nf (T.length . T.drop (ta_len `div` 3)) ta
, benchTL $ nf (TL.length . TL.drop (tla_len `div` 3)) tla
, benchBS $ nf (BS.length . BS.drop (bsa_len `div` 3)) bsa
, benchBSL $ nf (BL.length . BL.drop (bla_len `div` 3)) bla
, benchS $ nf (L.length . L.drop (sa_len `div` 3)) sa
]
, bgroup "filter"
[ benchT $ nf (T.length . T.filter p0) ta
, benchTL $ nf (TL.length . TL.filter p0) tla
, benchBS $ nf (BS.length . BS.filter p0) bsa
, benchBSL $ nf (BL.length . BL.filter p0) bla
, benchS $ nf (L.length . L.filter p0) sa
]
, bgroup "filter.filter"
[ benchT $ nf (T.length . T.filter p1 . T.filter p0) ta
, benchTL $ nf (TL.length . TL.filter p1 . TL.filter p0) tla
, benchBS $ nf (BS.length . BS.filter p1 . BS.filter p0) bsa
, benchBSL $ nf (BL.length . BL.filter p1 . BL.filter p0) bla
, benchS $ nf (L.length . L.filter p1 . L.filter p0) sa
]
, bgroup "init"
[ benchT $ nf (T.length . T.init) ta
, benchTL $ nf (TL.length . TL.init) tla
, benchBS $ nf (BS.length . BS.init) bsa
, benchBSL $ nf (BL.length . BL.init) bla
, benchS $ nf (L.length . L.init) sa
]
, bgroup "intercalate"
[ benchT $ nf (T.length . T.intercalate tsw) tl
, benchTL $ nf (TL.length . TL.intercalate tlw) tll
, benchBS $ nf (BS.length . BS.intercalate bsw) bsl
, benchBSL $ nf (BL.length . BL.intercalate blw) bll
, benchS $ nf (L.length . L.intercalate lw) sl
]
, bgroup "intersperse"
[ benchT $ nf (T.length . T.intersperse c) ta
, benchTL $ nf (TL.length . TL.intersperse c) tla
, benchBS $ nf (BS.length . BS.intersperse c) bsa
, benchBSL $ nf (BL.length . BL.intersperse c) bla
, benchS $ nf (L.length . L.intersperse c) sa
]
, bgroup "map"
[ benchT $ nf (T.length . T.map f) ta
, benchTL $ nf (TL.length . TL.map f) tla
, benchBS $ nf (BS.length . BS.map f) bsa
, benchBSL $ nf (BL.length . BL.map f) bla
, benchS $ nf (L.length . L.map f) sa
]
, bgroup "map.map"
[ benchT $ nf (T.length . T.map f . T.map f) ta
, benchTL $ nf (TL.length . TL.map f . TL.map f) tla
, benchBS $ nf (BS.length . BS.map f . BS.map f) bsa
, benchS $ nf (L.length . L.map f . L.map f) sa
]
, bgroup "replicate char"
[ benchT $ nf (T.length . T.replicate bsa_len) (T.singleton c)
, benchTL $ nf (TL.length . TL.replicate (fromIntegral bsa_len)) (TL.singleton c)
, benchBS $ nf (BS.length . BS.replicate bsa_len) c
, benchBSL $ nf (BL.length . BL.replicate (fromIntegral bsa_len)) c
, benchS $ nf (L.length . L.replicate bsa_len) c
]
, bgroup "replicate string"
[ benchT $ nf (T.length . T.replicate (bsa_len `div` T.length tsw)) tsw
, benchTL $ nf (TL.length . TL.replicate (fromIntegral bsa_len `div` TL.length tlw)) tlw
, benchS $ nf (L.length . replicat (bsa_len `div` T.length tsw)) lw
]
, bgroup "take"
[ benchT $ nf (T.length . T.take (ta_len `div` 3)) ta
, benchTL $ nf (TL.length . TL.take (tla_len `div` 3)) tla
, benchBS $ nf (BS.length . BS.take (bsa_len `div` 3)) bsa
, benchBSL $ nf (BL.length . BL.take (bla_len `div` 3)) bla
, benchS $ nf (L.length . L.take (sa_len `div` 3)) sa
]
, bgroup "tail"
[ benchT $ nf (T.length . T.tail) ta
, benchTL $ nf (TL.length . TL.tail) tla
, benchBS $ nf (BS.length . BS.tail) bsa
, benchBSL $ nf (BL.length . BL.tail) bla
, benchS $ nf (L.length . L.tail) sa
]
, bgroup "toLower"
[ benchT $ nf (T.length . T.toLower) ta
, benchTL $ nf (TL.length . TL.toLower) tla
, benchBS $ nf (BS.length . BS.map toLower) bsa
, benchBSL $ nf (BL.length . BL.map toLower) bla
, benchS $ nf (L.length . L.map toLower) sa
]
, bgroup "toUpper"
[ benchT $ nf (T.length . T.toUpper) ta
, benchTL $ nf (TL.length . TL.toUpper) tla
, benchBS $ nf (BS.length . BS.map toUpper) bsa
, benchBSL $ nf (BL.length . BL.map toUpper) bla
, benchS $ nf (L.length . L.map toUpper) sa
]
, bgroup "words"
[ benchT $ nf (L.length . T.words) ta
, benchTL $ nf (L.length . TL.words) tla
, benchBS $ nf (L.length . BS.words) bsa
, benchBSL $ nf (L.length . BL.words) bla
, benchS $ nf (L.length . L.words) sa
]
, bgroup "zipWith"
[ benchT $ nf (T.length . T.zipWith min tb) ta
, benchTL $ nf (TL.length . TL.zipWith min tlb) tla
, benchBS $ nf (L.length . BS.zipWith min bsb) bsa
, benchBSL $ nf (L.length . BL.zipWith min blb) bla
, benchS $ nf (L.length . L.zipWith min sb) sa
]
]
, bgroup "Builder"
[ bench "mappend char" $ nf (TL.length . TB.toLazyText . mappendNChar 'a') 10000
, bench "mappend 8 char" $ nf (TL.length . TB.toLazyText . mappend8Char) 'a'
, bench "mappend text" $ nf (TL.length . TB.toLazyText . mappendNText short) 10000
]
]
where
benchS = bench ("String+" ++ kind)
benchT = bench ("Text+" ++ kind)
benchTL = bench ("LazyText+" ++ kind)
benchBS = bench ("ByteString+" ++ kind)
benchBSL = bench ("LazyByteString+" ++ kind)
c = 'й'
p0 = (== c)
p1 = (/= 'д')
lw = "право"
bsw = UTF8.fromString lw
blw = BL.fromChunks [bsw]
tsw = T.pack lw
tlw = TL.fromChunks [tsw]
f (C# c#) = C# (chr# (ord# c# +# 1#))
g (I# i#) (C# c#) = (I# (i# +# 1#), C# (chr# (ord# c# +# i#)))
len l _ = l + (1::Int)
replicat n = concat . L.replicate n
short = T.pack "short"
#if !MIN_VERSION_bytestring(0,10,0)
instance NFData BS.ByteString
instance NFData BL.ByteString where
rnf BL.Empty = ()
rnf (BL.Chunk _ ts) = rnf ts
#endif
data B where
B :: NFData a => a -> B
instance NFData B where
rnf (B b) = rnf b
-- | Split a bytestring in chunks
--
chunksOf :: Int -> BS.ByteString -> [BS.ByteString]
chunksOf k = go
where
go t = case BS.splitAt k t of
(a,b) | BS.null a -> []
| otherwise -> a : go b
-- | Append a character n times
--
mappendNChar :: Char -> Int -> TB.Builder
mappendNChar c n = go 0
where
go i
| i < n = TB.singleton c `mappend` go (i+1)
| otherwise = mempty
-- | Gives more opportunity for inlining and elimination of unnecesary
-- bounds checks.
--
mappend8Char :: Char -> TB.Builder
mappend8Char c = TB.singleton c `mappend` TB.singleton c `mappend`
TB.singleton c `mappend` TB.singleton c `mappend`
TB.singleton c `mappend` TB.singleton c `mappend`
TB.singleton c `mappend` TB.singleton c
-- | Append a text N times
--
mappendNText :: T.Text -> Int -> TB.Builder
mappendNText t n = go 0
where
go i
| i < n = TB.fromText t `mappend` go (i+1)
| otherwise = mempty

View File

@ -1,100 +0,0 @@
-- | Read numbers from a file with a just a number on each line, find the
-- minimum of those numbers. The file contains different kinds of numbers:
--
-- * Decimals
--
-- * Hexadecimals
--
-- * Floating point numbers
--
-- * Floating point numbers in scientific notation
--
-- The different benchmarks will only take into account the values they can
-- parse.
--
-- Tested in this benchmark:
--
-- * Lexing/parsing of different numerical types
--
module Benchmarks.ReadNumbers
( initEnv
, benchmark
) where
import Criterion (Benchmark, bgroup, bench, whnf)
import Data.List (foldl')
import Numeric (readDec, readFloat, readHex)
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.Lex.Fractional as B
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TL
import qualified Data.Text.Lazy.Read as TL
import qualified Data.Text.Read as T
type Env = ([String], [T.Text], [TL.Text], [B.ByteString], [BL.ByteString])
initEnv :: FilePath -> IO Env
initEnv fp = do
-- Read all files into lines: string, text, lazy text, bytestring, lazy
-- bytestring
s <- lines `fmap` readFile fp
t <- T.lines `fmap` T.readFile fp
tl <- TL.lines `fmap` TL.readFile fp
b <- B.lines `fmap` B.readFile fp
bl <- BL.lines `fmap` BL.readFile fp
return (s, t, tl, b, bl)
benchmark :: Env -> Benchmark
benchmark ~(s, t, tl, b, bl) =
bgroup "ReadNumbers"
[ bench "DecimalString" $ whnf (int . string readDec) s
, bench "HexadecimalString" $ whnf (int . string readHex) s
, bench "DoubleString" $ whnf (double . string readFloat) s
, bench "DecimalText" $ whnf (int . text (T.signed T.decimal)) t
, bench "HexadecimalText" $ whnf (int . text (T.signed T.hexadecimal)) t
, bench "DoubleText" $ whnf (double . text T.double) t
, bench "RationalText" $ whnf (double . text T.rational) t
, bench "DecimalLazyText" $
whnf (int . text (TL.signed TL.decimal)) tl
, bench "HexadecimalLazyText" $
whnf (int . text (TL.signed TL.hexadecimal)) tl
, bench "DoubleLazyText" $
whnf (double . text TL.double) tl
, bench "RationalLazyText" $
whnf (double . text TL.rational) tl
, bench "DecimalByteString" $ whnf (int . byteString B.readInt) b
, bench "DoubleByteString" $ whnf (double . byteString B.readDecimal) b
, bench "DecimalLazyByteString" $
whnf (int . byteString BL.readInt) bl
]
where
-- Used for fixing types
int :: Int -> Int
int = id
double :: Double -> Double
double = id
string :: (Ord a, Num a) => (t -> [(a, t)]) -> [t] -> a
string reader = foldl' go 1000000
where
go z t = case reader t of [(n, _)] -> min n z
_ -> z
text :: (Ord a, Num a) => (t -> Either String (a,t)) -> [t] -> a
text reader = foldl' go 1000000
where
go z t = case reader t of Left _ -> z
Right (n, _) -> min n z
byteString :: (Ord a, Num a) => (t -> Maybe (a,t)) -> [t] -> a
byteString reader = foldl' go 1000000
where
go z t = case reader t of Nothing -> z
Just (n, _) -> min n z

View File

@ -1,50 +0,0 @@
{-# LANGUAGE BangPatterns #-}
-- | Replace a string by another string
--
-- Tested in this benchmark:
--
-- * Search and replace of a pattern in a text
--
module Benchmarks.Replace
( benchmark
, initEnv
) where
import Criterion (Benchmark, bgroup, bench, nf)
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Search as BL
import qualified Data.ByteString.Search as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.Text.Lazy.IO as TL
type Env = (T.Text, B.ByteString, TL.Text, BL.ByteString)
initEnv :: FilePath -> IO Env
initEnv fp = do
tl <- TL.readFile fp
bl <- BL.readFile fp
let !t = TL.toStrict tl
!b = T.encodeUtf8 t
return (t, b, tl, bl)
benchmark :: String -> String -> Env -> Benchmark
benchmark pat sub ~(t, b, tl, bl) =
bgroup "Replace" [
bench "Text" $ nf (T.length . T.replace tpat tsub) t
, bench "ByteString" $ nf (BL.length . B.replace bpat bsub) b
, bench "LazyText" $ nf (TL.length . TL.replace tlpat tlsub) tl
, bench "LazyByteString" $ nf (BL.length . BL.replace blpat blsub) bl
]
where
tpat = T.pack pat
tsub = T.pack sub
tlpat = TL.pack pat
tlsub = TL.pack sub
bpat = T.encodeUtf8 tpat
bsub = T.encodeUtf8 tsub
blpat = B.concat $ BL.toChunks $ TL.encodeUtf8 tlpat
blsub = B.concat $ BL.toChunks $ TL.encodeUtf8 tlsub

View File

@ -1,55 +0,0 @@
-- | Search for a pattern in a file, counting the number of occurrences
--
-- Tested in this benchmark:
--
-- * Searching all occurrences of a pattern using library routines
--
module Benchmarks.Search
    ( initEnv
    , benchmark
    ) where

import Criterion (Benchmark, bench, bgroup, whnf)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Search as BL
import qualified Data.ByteString.Search as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TL

-- | The haystack file in all four representations.
type Env = (B.ByteString, BL.ByteString, T.Text, TL.Text)

-- | Read the input file once per representation.
initEnv :: FilePath -> IO Env
initEnv fp =
    (,,,) <$> B.readFile fp <*> BL.readFile fp <*> T.readFile fp <*> TL.readFile fp

-- | Count occurrences of the needle in each representation of the haystack.
benchmark :: T.Text -> Env -> Benchmark
benchmark needleT ~(b, bl, t, tl) =
    bgroup "FileIndices"
        [ bench "ByteString" $ whnf countStrictBytes b
        , bench "LazyByteString" $ whnf countLazyBytes bl
        , bench "Text" $ whnf countStrictText t
        , bench "LazyText" $ whnf countLazyText tl
        ]
  where
    -- Needle converted once, outside the measured functions.
    needleB = T.encodeUtf8 needleT
    needleTL = TL.fromChunks [needleT]

    countStrictBytes :: B.ByteString -> Int
    countStrictBytes = length . B.indices needleB

    countLazyBytes :: BL.ByteString -> Int
    countLazyBytes = length . BL.indices needleB

    countStrictText :: T.Text -> Int
    countStrictText = T.count needleT

    countLazyText :: TL.Text -> Int
    countLazyText haystack = fromIntegral (TL.count needleTL haystack)

View File

@ -1,118 +0,0 @@
-- | This module contains a number of benchmarks for the different streaming
-- functions
--
-- Tested in this benchmark:
--
-- * Most streaming functions
--
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveAnyClass, DeriveGeneric, RecordWildCards #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Benchmarks.Stream
( initEnv
, benchmark
) where
import Control.DeepSeq (NFData (..))
import Criterion (Benchmark, bgroup, bench, nf)
import qualified Data.Text as T
import qualified Data.ByteString as B
import qualified Data.Text.Lazy as TL
import qualified Data.ByteString.Lazy as BL
import Data.Text.Internal.Fusion.Types (Step (..), Stream (..))
import qualified Data.Text.Encoding as T
import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Internal.Encoding.Fusion as T
import qualified Data.Text.Internal.Encoding.Fusion.Common as F
import qualified Data.Text.Internal.Fusion as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.Text.Internal.Lazy.Encoding.Fusion as TL
import qualified Data.Text.Internal.Lazy.Fusion as TL
import qualified Data.Text.Lazy.IO as TL
import GHC.Generics (Generic)
-- Orphan instance (hence -fno-warn-orphans above): lets Criterion's 'nf'
-- fully evaluate a fusion 'Stream' by walking it to 'Done'.
instance NFData a => NFData (Stream a) where
-- Currently, this implementation does not force evaluation of the size hint
rnf (Stream next s0 _) = go s0
where
go !s = case next s of
Done -> ()
Skip s' -> go s'
Yield x s' -> rnf x `seq` go s'
-- | Benchmark environment: the input file pre-encoded in several Unicode
-- encodings (strict and lazy), plus a ready-made character stream.
data Env = Env
{ t :: !T.Text
, utf8 :: !B.ByteString
, utf16le :: !B.ByteString
, utf16be :: !B.ByteString
, utf32le :: !B.ByteString
, utf32be :: !B.ByteString
, tl :: !TL.Text
, utf8L :: !BL.ByteString
, utf16leL :: !BL.ByteString
, utf16beL :: !BL.ByteString
, utf32leL :: !BL.ByteString
, utf32beL :: !BL.ByteString
, s :: T.Stream Char
} deriving (Generic, NFData)
-- | Read the input file once and eagerly pre-encode it (the bang patterns
-- force each encoding) so that encoding cost is excluded from measurements.
initEnv :: FilePath -> IO Env
initEnv fp = do
-- Different formats
t <- T.readFile fp
let !utf8 = T.encodeUtf8 t
!utf16le = T.encodeUtf16LE t
!utf16be = T.encodeUtf16BE t
!utf32le = T.encodeUtf32LE t
!utf32be = T.encodeUtf32BE t
-- Once again for the lazy variants
tl <- TL.readFile fp
let !utf8L = TL.encodeUtf8 tl
!utf16leL = TL.encodeUtf16LE tl
!utf16beL = TL.encodeUtf16BE tl
!utf32leL = TL.encodeUtf32LE tl
!utf32beL = TL.encodeUtf32BE tl
-- For the functions which operate on streams
let !s = T.stream t
return Env{..}
-- | Benchmark stream construction and the UTF-8/16/32 decoding streams,
-- for both strict and lazy Text, plus the restream* re-encoders.
benchmark :: Env -> Benchmark
benchmark ~Env{..} =
bgroup "Stream"
-- Fusion
[ bgroup "stream" $
[ bench "Text" $ nf T.stream t
, bench "LazyText" $ nf TL.stream tl
]
-- Encoding.Fusion
, bgroup "streamUtf8"
[ bench "Text" $ nf (T.streamUtf8 E.lenientDecode) utf8
, bench "LazyText" $ nf (TL.streamUtf8 E.lenientDecode) utf8L
]
, bgroup "streamUtf16LE"
[ bench "Text" $ nf (T.streamUtf16LE E.lenientDecode) utf16le
, bench "LazyText" $ nf (TL.streamUtf16LE E.lenientDecode) utf16leL
]
, bgroup "streamUtf16BE"
[ bench "Text" $ nf (T.streamUtf16BE E.lenientDecode) utf16be
, bench "LazyText" $ nf (TL.streamUtf16BE E.lenientDecode) utf16beL
]
, bgroup "streamUtf32LE"
[ bench "Text" $ nf (T.streamUtf32LE E.lenientDecode) utf32le
, bench "LazyText" $ nf (TL.streamUtf32LE E.lenientDecode) utf32leL
]
, bgroup "streamUtf32BE"
[ bench "Text" $ nf (T.streamUtf32BE E.lenientDecode) utf32be
, bench "LazyText" $ nf (TL.streamUtf32BE E.lenientDecode) utf32beL
]
-- Encoding.Fusion.Common
, bench "restreamUtf16LE" $ nf F.restreamUtf16LE s
, bench "restreamUtf16BE" $ nf F.restreamUtf16BE s
, bench "restreamUtf32LE" $ nf F.restreamUtf32LE s
, bench "restreamUtf32BE" $ nf F.restreamUtf32BE s
]

View File

@ -1,43 +0,0 @@
-- | A word frequency count using the different string types
--
-- Tested in this benchmark:
--
-- * Splitting into words
--
-- * Converting to lowercase
--
-- * Comparing: Eq/Ord instances
--
module Benchmarks.WordFrequencies
    ( initEnv
    , benchmark
    ) where

import Criterion (Benchmark, bench, bgroup, whnf)
import Data.Char (toLower)
import Data.List (foldl')
import Data.Map (Map)
import qualified Data.ByteString.Char8 as B
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.IO as T

-- | The benchmark input as String, ByteString and Text.
type Env = (String, B.ByteString, T.Text)

-- | Read the input file once per representation.
initEnv :: FilePath -> IO Env
initEnv fp =
    (,,) <$> readFile fp <*> B.readFile fp <*> T.readFile fp

-- | Lower-case the document, split it into words, and count how often
-- each word occurs.
benchmark :: Env -> Benchmark
benchmark ~(s, b, t) =
    bgroup "WordFrequencies"
        [ bench "String" $ whnf (frequencies . words . map toLower) s
        , bench "ByteString" $ whnf (frequencies . B.words . B.map toLower) b
        , bench "Text" $ whnf (frequencies . T.words . T.toLower) t
        ]

-- | Count the occurrences of each element of a list.
frequencies :: Ord a => [a] -> Map a Int
frequencies = foldl' bump M.empty
  where
    bump acc key = M.insertWith (+) key 1 acc

View File

@ -1,32 +0,0 @@
{-# LANGUAGE BangPatterns, OverloadedStrings, RankNTypes #-}

module Main (
    main
    ) where

import Control.Monad (forM_)
import qualified Data.ByteString as B
import qualified Data.Text as Text
import Data.Text.Encoding (decodeUtf8)
import Data.Text (Text)
import System.IO (hFlush, stdout)
import Timer (timer)

-- | A benchmark takes the document, forces its result to WHNF and
-- discards it.
type BM = Text -> ()

-- | Wrap any text-consuming function as a benchmark.
bm :: forall a. (Text -> a) -> BM
bm run haystack = run haystack `seq` ()

-- | The named micro-benchmarks run over the input document.
benchmarks :: [(String, Text.Text -> ())]
benchmarks = [
    ("find_first", bm $ Text.isInfixOf "en:Benin")
  , ("find_index", bm $ Text.findIndex (=='c'))
  ]

main :: IO ()
main = do
    -- Force the decoded document before timing starts.
    !contents <- decodeUtf8 <$> B.readFile "../tests/text-test-data/yiwiki.xml"
    forM_ benchmarks $ \(name, bmark) -> do
        putStr (name ++ " ")
        hFlush stdout
        result <- timer 100 contents bmark
        putStrLn result

View File

@ -1,30 +0,0 @@
{-# LANGUAGE BangPatterns #-}
module Timer (timer) where
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX (getPOSIXTime)
import GHC.Float (FFFormat(..), formatRealFloat)
-- | Render a 'Double' in fixed-point notation with @k@ digits after the
-- decimal point.
ickyRound :: Int -> Double -> String
ickyRound k = formatRealFloat FFFixed (Just k)
-- | Run @f a0@ @count@ times per trial for three trials, keep the fastest
-- per-call wall-clock time, and render it with a precision derived from
-- the magnitude of the measurement itself.
timer :: Int -> a -> (a -> b) -> IO String
timer count a0 f = do
-- 'loop' runs k trials, threading the fastest per-call time seen so far.
let loop !k !fastest
| k <= 0 = return fastest
| otherwise = do
start <- getPOSIXTime
-- 'evaluate' forces each result to WHNF so work is not deferred.
let inner a i
| i <= 0 = return ()
| otherwise = evaluate (f a) >> inner a (i-1)
inner a0 count
end <- getPOSIXTime
let elapsed = end - start
loop (k-1) (min fastest (elapsed / fromIntegral count))
-- 1e300 is a sentinel "slower than anything" starting value.
t <- loop (3::Int) 1e300
let log10 x = log x / log 10
ft = realToFrac t
prec = round (log10 (fromIntegral count) - log10 ft)
return $! ickyRound prec ft
-- NOINLINE keeps GHC from inlining/specialising the timing loop at call
-- sites, which could distort the measurement.
{-# NOINLINE timer #-}

View File

@ -1,2 +0,0 @@
__pycache__
*.pyc

View File

@ -1,12 +0,0 @@
#!/usr/bin/env python
import utils, sys, codecs


def cut(filename, l, r):
    """Print the [l:r] column slice of every line of *filename*.

    The file is decoded as UTF-8. Lines shorter than ``l`` print as empty
    lines, matching Python slice semantics.
    """
    # Use a context manager so the handle is closed deterministically
    # (the original leaked the open file object).
    with open(filename, encoding='utf-8') as content:
        for line in content:
            print(line[l:r])


for f in sys.argv[1:]:
    t = utils.benchmark(lambda: cut(f, 20, 40))
    sys.stderr.write('{0}: {1}\n'.format(f, t))

View File

@ -1,50 +0,0 @@
#!/usr/bin/env python
import math
import sys
import time
def find_first():
cf = contents.find
return timer(lambda: cf("en:Benin"))
def timer(f, count=100):
a = 1e300
def g():
return
for i in xrange(3):
start = time.time()
for j in xrange(count):
g()
a = min(a, (time.time() - start) / count)
b = 1e300
for i in xrange(3):
start = time.time()
for j in xrange(count):
f()
b = min(b, (time.time() - start) / count)
return round(b - a, int(round(math.log(count, 10) - math.log(b - a, 10))))
contents = open('../../tests/text-test-data/yiwiki.xml', 'r').read()
contents = contents.decode('utf-8')
benchmarks = (
find_first,
)
to_run = sys.argv[1:]
bms = []
if to_run:
for r in to_run:
for b in benchmarks:
if b.__name__.startswith(r):
bms.append(b)
else:
bms = benchmarks
for b in bms:
sys.stdout.write(b.__name__ + ' ')
sys.stdout.flush()
print b()

View File

@ -1,13 +0,0 @@
#!/usr/bin/env python
import utils, sys, codecs


def sort(filename):
    """Print the lines of *filename* (decoded as UTF-8) in sorted order."""
    # Context manager fixes the file-handle leak in the original.
    with open(filename, encoding='utf-8') as handle:
        content = handle.read()
    lines = content.splitlines()
    lines.sort()
    print('\n'.join(lines))


for f in sys.argv[1:]:
    t = utils.benchmark(lambda: sort(f))
    sys.stderr.write('{0}: {1}\n'.format(f, t))

View File

@ -1,25 +0,0 @@
#!/usr/bin/env python
import utils, sys


def strip_tags(filename):
    """Print *filename* with every character inside <...> tags, including
    the angle brackets themselves, replaced by a space."""
    # Context manager fixes the file-handle leak in the original.
    with open(filename, encoding='utf-8') as handle:
        string = handle.read()
    depth = 0
    out = []
    for c in string:
        if c == '<':
            depth += 1
        # Any character seen while inside a tag becomes a space; the
        # depth counter handles (possibly malformed) nested '<'.
        out.append(' ' if depth > 0 else c)
        if c == '>':
            depth -= 1
    print(''.join(out))


for f in sys.argv[1:]:
    t = utils.benchmark(lambda: strip_tags(f))
    sys.stderr.write('{0}: {1}\n'.format(f, t))

View File

@ -1,18 +0,0 @@
#!/usr/bin/env python
import sys, time


def benchmark_once(f):
    """Return the wall-clock seconds taken by a single call of f."""
    started = time.time()
    f()
    return time.time() - started


def benchmark(f):
    """Call f 100 times, logging each run's time to stderr, and return
    the mean wall-clock time per run."""
    runs = 100
    total = 0.0
    for i in range(runs):
        result = benchmark_once(f)
        sys.stderr.write('Run {0}: {1}\n'.format(i, result))
        total += result
    return total / runs

View File

@ -1,16 +0,0 @@
#!/usr/bin/env ruby

require './utils.rb'

# Print the column window [from_col, to_col) of every line of the file.
def cut(filename, from_col, to_col)
  File.open(filename, 'r:utf-8') do |io|
    io.each_line { |line| puts line[from_col, to_col - from_col] }
  end
end

# Benchmark the cut on every file named on the command line.
ARGV.each do |f|
  t = benchmark { cut(f, 20, 40) }
  STDERR.puts "#{f}: #{t}"
end

View File

@ -1,50 +0,0 @@
#!/usr/bin/env ruby
require './utils.rb'
# Reflow the blank-line-separated paragraphs of +filename+ to at most
# +max_width+ columns and print the result.
def fold(filename, max_width)
File.open(filename, 'r:utf-8') do |file|
# Words in this paragraph
paragraph = []
file.each_line do |line|
# If we encounter an empty line, we reformat and dump the current
# paragraph
if line.strip.empty?
puts fold_paragraph(paragraph, max_width)
puts
paragraph = []
# Otherwise, we append the words found in the line to the paragraph
else
paragraph.concat line.split
end
end
# Last paragraph
puts fold_paragraph(paragraph, max_width) unless paragraph.empty?
end
end
# Fold a single paragraph to the desired width
# NOTE(review): `<<` mutates the first word of +paragraph+ in place; the
# caller discards the array afterwards, so this is safe here.
def fold_paragraph(paragraph, max_width)
# Gradually build our output
str, *rest = paragraph
width = str.length
rest.each do |word|
# A fitting word is appended with a space, otherwise a line break.
if width + word.length + 1 <= max_width
str << ' ' << word
width += word.length + 1
else
str << "\n" << word
width = word.length
end
end
str
end
# Benchmark the fold on every file named on the command line.
ARGV.each do |f|
t = benchmark { fold(f, 80) }
STDERR.puts "#{f}: #{t}"
end

View File

@ -1,15 +0,0 @@
#!/usr/bin/env ruby

require './utils.rb'

# Read the whole file (UTF-8) and print its lines in sorted order.
def sort(filename)
  content = File.open(filename, 'r:utf-8') { |file| file.read }
  puts content.lines.sort.join
end

# Benchmark the sort on every file named on the command line.
ARGV.each do |f|
  t = benchmark { sort(f) }
  STDERR.puts "#{f}: #{t}"
end

View File

@ -1,22 +0,0 @@
#!/usr/bin/env ruby
require './utils.rb'
# Print +filename+ with every character inside <...> tags, including the
# angle brackets themselves, replaced by a space.
def strip_tags(filename)
File.open(filename, 'r:utf-8') do |file|
str = file.read
# d tracks the current tag nesting depth.
d = 0
str.each_char do |c|
# Order matters: '<' bumps depth before output, '>' drops it after,
# so both delimiters are themselves blanked.
d += 1 if c == '<'
putc(if d > 0 then ' ' else c end)
d -= 1 if c == '>'
end
end
end
# Benchmark tag stripping on every file named on the command line.
ARGV.each do |f|
t = benchmark { strip_tags(f) }
STDERR.puts "#{f}: #{t}"
end

View File

@ -1,14 +0,0 @@
require 'benchmark'

# Run the block 100 times, reporting each run's total CPU time on stderr,
# and return the mean time per run.
def benchmark(&block)
  runs = 100
  total = (0...runs).reduce(0) do |acc, i|
    result = Benchmark.measure(&block).total
    $stderr.puts "Run #{i}: #{result}"
    acc + result
  end
  total / runs
end

View File

@ -1,148 +0,0 @@
cabal-version: 1.12
name: text-benchmarks
version: 0.0.0.0
synopsis: Benchmarks for the text package
description: Benchmarks for the text package
homepage: https://bitbucket.org/bos/text
license: BSD2
license-file: ../LICENSE
author: Jasper Van der Jeugt <jaspervdj@gmail.com>,
Bryan O'Sullivan <bos@serpentine.com>,
Tom Harper <rtomharper@googlemail.com>,
Duncan Coutts <duncan@haskell.org>
maintainer: jaspervdj@gmail.com
category: Text
build-type: Simple
flag bytestring-builder
description: Depend on the bytestring-builder package for backwards compatibility.
default: False
manual: False
flag llvm
description: use LLVM
default: False
manual: True
executable text-benchmarks
ghc-options: -Wall -O2 -rtsopts
if flag(llvm)
ghc-options: -fllvm
cpp-options: -DINTEGER_GMP
build-depends: array,
base == 4.*,
binary,
blaze-builder,
bytestring-lexing >= 0.5.0,
containers,
criterion >= 0.10.0.0,
deepseq,
directory,
filepath,
ghc-prim,
integer-gmp,
stringsearch,
template-haskell,
transformers,
utf8-string,
vector
if flag(bytestring-builder)
build-depends: bytestring >= 0.9 && < 0.10.4,
bytestring-builder >= 0.10.4
else
build-depends: bytestring >= 0.10.4
-- modules for benchmark proper
c-sources: cbits/time_iconv.c
hs-source-dirs: haskell
main-is: Benchmarks.hs
other-modules:
Benchmarks.Builder
Benchmarks.Concat
Benchmarks.DecodeUtf8
Benchmarks.EncodeUtf8
Benchmarks.Equality
Benchmarks.FileRead
Benchmarks.FoldLines
Benchmarks.Mul
Benchmarks.Programs.BigTable
Benchmarks.Programs.Cut
Benchmarks.Programs.Fold
Benchmarks.Programs.Sort
Benchmarks.Programs.StripTags
Benchmarks.Programs.Throughput
Benchmarks.Pure
Benchmarks.ReadNumbers
Benchmarks.Replace
Benchmarks.Search
Benchmarks.Stream
Benchmarks.WordFrequencies
-- Source code for IUT (implementation under test)
-- "borrowed" from parent folder
include-dirs: ../include
c-sources: ../cbits/cbits.c
hs-source-dirs: ..
other-modules:
Data.Text
Data.Text.Array
Data.Text.Encoding
Data.Text.Encoding.Error
Data.Text.Foreign
Data.Text.IO
Data.Text.Internal
Data.Text.Internal.Builder
Data.Text.Internal.Builder.Functions
Data.Text.Internal.Builder.Int.Digits
Data.Text.Internal.Builder.RealFloat.Functions
Data.Text.Internal.Encoding.Fusion
Data.Text.Internal.Encoding.Fusion.Common
Data.Text.Internal.Encoding.Utf16
Data.Text.Internal.Encoding.Utf32
Data.Text.Internal.Encoding.Utf8
Data.Text.Internal.Functions
Data.Text.Internal.Fusion
Data.Text.Internal.Fusion.CaseMapping
Data.Text.Internal.Fusion.Common
Data.Text.Internal.Fusion.Size
Data.Text.Internal.Fusion.Types
Data.Text.Internal.IO
Data.Text.Internal.Lazy
Data.Text.Internal.Lazy.Encoding.Fusion
Data.Text.Internal.Lazy.Fusion
Data.Text.Internal.Lazy.Search
Data.Text.Internal.Private
Data.Text.Internal.Read
Data.Text.Internal.Search
Data.Text.Internal.Unsafe
Data.Text.Internal.Unsafe.Char
Data.Text.Internal.Unsafe.Shift
Data.Text.Lazy
Data.Text.Lazy.Builder
Data.Text.Lazy.Builder.Int
Data.Text.Lazy.Builder.RealFloat
Data.Text.Lazy.Encoding
Data.Text.Lazy.IO
Data.Text.Lazy.Internal
Data.Text.Lazy.Read
Data.Text.Read
Data.Text.Unsafe
Data.Text.Show
default-language: Haskell2010
default-extensions: NondecreasingIndentation
executable text-multilang
hs-source-dirs: haskell
main-is: Multilang.hs
other-modules: Timer
ghc-options: -Wall -O2
build-depends: base == 4.*,
bytestring,
text,
time
default-language: Haskell2010
default-extensions: NondecreasingIndentation

View File

@ -1,3 +0,0 @@
-- See http://cabal.readthedocs.io/en/latest/nix-local-build-overview.html
packages: ., benchmarks, th-tests
tests: True

View File

@ -1,226 +0,0 @@
### 1.2.4.0
* Add TH `Lift` instances for `Data.Text.Text` and `Data.Text.Lazy.Text` (gh-232)
* Update Haddock documentation to better reflect fusion eligibility; improve fusion
rules for `takeWhileEnd` and `length` (gh-241, ghc-202)
* Optimise `Data.Text.replicate` from `O(n)` to `O(log n)` (gh-209)
* Support `base-4.13.0.0`
### 1.2.3.1
* Make `decodeUtf8With` fail explicitly for unsupported non-BMP
replacement characters instead of silently exhibiting undefined behaviour (gh-213)
* Fix termination condition for file reads via `Data.Text.IO`
operations (gh-223)
* A serious correctness issue affecting uses of `take` and `drop` with
negative counts has been fixed (gh-227)
* A bug in the case-mapping functions resulting in unreasonably large
allocations with large arguments has been fixed (gh-221)
### 1.2.3.0
* Spec compliance: `toCaseFold` now follows the Unicode 9.0 spec
(updated from 8.0).
* Bug fix: the lazy `takeWhileEnd` function violated the
[lazy text invariant](https://github.com/bos/text/blob/1.2.3.0/Data/Text/Internal/Lazy.hs#L51)
(gh-184).
* Bug fix: Fixed usage of size hints causing incorrect behavior (gh-197).
* New function: `unsnoc` (gh-173).
* Reduce memory overhead in `encodeUTF8` (gh-194).
* Improve UTF-8 decoder error-recovery (gh-182).
* Minor documentation improvements (`@since` annotations, more
examples, clarifications).
#### 1.2.2.2
* The `toTitle` function now correctly handles letters that
immediately follow punctuation. Before, `"there's"` would turn into
`"There'S"`. Now, it becomes `"There's"`.
* The implementation of unstreaming is faster, resulting in operations
such as `map` and `intersperse` speeding up by up to 30%, with
smaller code generated.
* The optimised length comparison function is now more likely to be
used after some rewrite rule tweaking.
* Bug fix: an off-by-one bug in `takeEnd` is fixed.
* Bug fix: a logic error in `takeWord16` is fixed.
#### 1.2.2.1
* The switch to `integer-pure` in 1.2.2.0 was apparently mistaken.
The build flag has been renamed accordingly. Your army of diligent
maintainers apologizes for the churn.
* Spec compliance: `toCaseFold` now follows the Unicode 8.0 spec
(updated from 7.0)
* An STG lint error has been fixed
### 1.2.2.0
* The `integer-simple` package, upon which this package optionally
depended, has been replaced with `integer-pure`. The build flag has
been renamed accordingly.
* Bug fix: For the `Binary` instance, if UTF-8 decoding fails during a
`get`, the error is propagated via `fail` instead of an uncatchable
crash.
* New function: `takeWhileEnd`
* New instances for the `Text` types:
* if `base` >= 4.7: `PrintfArg`
* if `base` >= 4.9: `Semigroup`
#### 1.2.1.3
* Bug fix: As it turns out, moving the literal rewrite rules to simplifier
phase 2 does not prevent competition with the `unpack` rule, which is
also active in this phase. Unfortunately this was hidden due to a silly
test environment mistake. Moving literal rules back to phase 1 finally
fixes GHC Trac #10528 correctly.
#### 1.2.1.2
* Bug fix: Run literal rewrite rules in simplifier phase 2.
The behavior of the simplifier changed in GHC 7.10.2,
causing these rules to fail to fire, leading to poor code generation
and long compilation times. See
[GHC Trac #10528](https://ghc.haskell.org/trac/ghc/ticket/10528).
#### 1.2.1.1
* Expose unpackCString#, which you should never use.
### 1.2.1.0
* Added Binary instances for both Text types. (If you have previously
been using the text-binary package to get a Binary instance, it is
now obsolete.)
#### 1.2.0.6
* Fixed a space leak in UTF-8 decoding
#### 1.2.0.5
* Feature parity: repeat, cycle, iterate are now implemented for lazy
Text, and the Data instance is more complete
* Build speed: an inliner space explosion has been fixed with toCaseFold
* Bug fix: encoding Int to a Builder would infinite-loop if the
integer-simple package was used
* Deprecation: OnEncodeError and EncodeError are deprecated, as they
are never used
* Internals: some types that are used internally in fusion-related
functions have moved around, been renamed, or been deleted (we don't
bump the major version if .Internal modules change)
* Spec compliance: toCaseFold now follows the Unicode 7.0 spec
(updated from 6.3)
#### 1.2.0.4
* Fixed an incompatibility with base < 4.5
#### 1.2.0.3
* Update formatRealFloat to correspond to the definition in versions
of base newer than 4.5 (https://github.com/bos/text/issues/105)
#### 1.2.0.2
* Bumped lower bound on deepseq to 1.4 for compatibility with the
upcoming GHC 7.10
#### 1.2.0.1
* Fixed a buffer overflow in rendering of large Integers
(https://github.com/bos/text/issues/99)
## 1.2.0.0
* Fixed an integer overflow in the replace function
(https://github.com/bos/text/issues/81)
* Fixed a hang in lazy decodeUtf8With
(https://github.com/bos/text/issues/87)
* Reduced codegen bloat caused by use of empty and single-character
literals
* Added an instance of IsList for GHC 7.8 and above
### 1.1.1.0
* The Data.Data instance now allows gunfold to work, via a virtual
pack constructor
* dropEnd, takeEnd: new functions
* Comparing the length of a Text against a number can now
short-circuit in more cases
#### 1.1.0.1
* streamDecodeUtf8: fixed gh-70, did not return all unconsumed bytes
in single-byte chunks
## 1.1.0.0
* encodeUtf8: Performance is improved by up to 4x.
* encodeUtf8Builder, encodeUtf8BuilderEscaped: new functions,
available only if bytestring >= 0.10.4.0 is installed, that allow
very fast and flexible encoding of a Text value to a bytestring
Builder.
As an example of the performance gain to be had, the
encodeUtf8BuilderEscaped function helps to double the speed of JSON
encoding in the latest version of aeson! (Note: if all you need is a
plain ByteString, encodeUtf8 is still the faster way to go.)
* All of the internal module hierarchy is now publicly exposed. If a
module is in the .Internal hierarchy, or is documented as internal,
use at your own risk - there are no API stability guarantees for
internal modules!
#### 1.0.0.1
* decodeUtf8: Fixed a regression that caused us to incorrectly
identify truncated UTF-8 as valid (gh-61)
# 1.0.0.0
* Added support for Unicode 6.3.0 to case conversion functions
* New function toTitle converts words in a string to title case
* New functions peekCStringLen and withCStringLen simplify
interoperability with C functions
* Added support for decoding UTF-8 in stream-friendly fashion
* Fixed a bug in mapAccumL
* Added trusted Haskell support
* Removed support for GHC 6.10 (released in 2008) and older

View File

@ -1,28 +0,0 @@
-- This script compares the strict and lazy Text APIs to ensure that
-- they're reasonably in sync.
{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Set as S
import qualified Data.Text as T
import System.Process
-- 'tidy' shells out to ghci's ":browse" for a module and normalises the
-- output into a set of type-signature lines (stripping the module prefix
-- and continuation indentation); 'diff' prints the signatures present in
-- the first set but missing from the second.
main = do
let tidy pkg = (S.fromList . filter (T.isInfixOf "::") . T.lines .
T.replace "GHC.Int.Int64" "Int" .
T.replace "\n " "" .
T.replace (T.append (T.pack pkg) ".") "" . T.pack) `fmap`
readProcess "ghci" [] (":browse " ++ pkg)
let diff a b = mapM_ (putStrLn . (" "++) . T.unpack) . S.toList $
S.difference a b
text <- tidy "Data.Text"
lazy <- tidy "Data.Text.Lazy"
list <- tidy "Data.List"
putStrLn "Text \\ List:"
diff text list
putStrLn ""
putStrLn "Text \\ Lazy:"
diff text lazy
putStrLn ""
putStrLn "Lazy \\ Text:"
diff lazy text

View File

@ -1,44 +0,0 @@
-- | Small parsing toolkit shared by the Unicode-table generator scripts:
-- a few helpers for the @;@-separated UCD file format, plus re-exports of
-- the Parsec modules so callers need only import this module.
module Arsec
(
Comment
, comment
, semi
, showC
, unichar
, unichars
, module Control.Applicative
, module Control.Monad
, module Data.Char
, module Text.ParserCombinators.Parsec.Char
, module Text.ParserCombinators.Parsec.Combinator
, module Text.ParserCombinators.Parsec.Error
, module Text.ParserCombinators.Parsec.Prim
) where
import Control.Monad
import Control.Applicative
import Data.Char
import Numeric
-- Hide Parsec names that clash with Data.Char / Control.Applicative.
import Text.ParserCombinators.Parsec.Char hiding (lower, upper)
import Text.ParserCombinators.Parsec.Combinator hiding (optional)
import Text.ParserCombinators.Parsec.Error
import Text.ParserCombinators.Parsec.Prim hiding ((<|>), many)
-- | A comment read from a Unicode data file.
type Comment = String
-- | Parse one hexadecimal code point into a 'Char'.
unichar :: Parser Char
unichar = chr . fst . head . readHex <$> many1 hexDigit
-- | Parse a whitespace-separated run of code points terminated by 'semi'.
unichars :: Parser [Char]
unichars = manyTill (unichar <* spaces) semi
-- | A field separator: @;@ followed by optional whitespace.
semi :: Parser ()
semi = char ';' *> spaces *> pure ()
-- | A @#@-to-end-of-line comment, or a bare newline.
comment :: Parser Comment
comment = (char '#' *> manyTill anyToken (char '\n')) <|> string "\n"
-- | Render a character as a Haskell hexadecimal character literal,
-- zero-padded to four digits (e.g. @'\\x00e9'@).
showC :: Char -> String
showC c = "'\\x" ++ d ++ "'"
where h = showHex (ord c) ""
d = replicate (4 - length h) '0' ++ h

Some files were not shown because too many files have changed in this diff Show More