Skip to content

Commit dfd683d

Browse files
authored
support data in more byte-array/string types (#54)
1 parent f37a33d commit dfd683d

File tree

4 files changed

+32
-9
lines changed

4 files changed

+32
-9
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "StringEncodings"
22
uuid = "69024149-9ee7-55f6-a4c4-859efe599b68"
3-
version = "0.3.6"
3+
version = "0.3.7"
44

55
[deps]
66
Libiconv_jll = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"

src/StringEncodings.jl

+20-8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ export encoding, encodings_list, Encoding, @enc_str
2020

2121
abstract type StringEncodingError end
2222

23+
# Contiguous one-dimensional byte arrays whose memory can be passed to C APIs expecting `unsigned char *`
24+
const ByteVector= Union{Vector{UInt8},
25+
Base.FastContiguousSubArray{UInt8,1,<:Array{UInt8,1}},
26+
Base.CodeUnits{UInt8, String}, Base.CodeUnits{UInt8, SubString{String}}}
27+
# Native string types that `encode` processes directly; any other `AbstractString` is converted with `String(s)` first.
const ByteString = Union{String,SubString{String}}
28+
2329
# Specified encodings or the combination are not supported by iconv
2430
struct InvalidEncodingError <: StringEncodingError
2531
args::Tuple{String, String}
@@ -31,7 +37,7 @@ message(::Type{InvalidEncodingError}) = "Conversion from <<1>> to <<2>> not supp
3137
struct InvalidSequenceError <: StringEncodingError
3238
args::Tuple{String}
3339
end
34-
InvalidSequenceError(seq::Vector{UInt8}) = InvalidSequenceError((bytes2hex(seq),))
40+
function InvalidSequenceError(seq::AbstractVector{UInt8})
    # Store the offending bytes as a hex string; it is the single `<<1>>` argument of the message.
    hexstr = bytes2hex(seq)
    return InvalidSequenceError((hexstr,))
end
3541
message(::Type{InvalidSequenceError}) = "Byte sequence 0x<<1>> is invalid in source encoding or cannot be represented in target encoding"
3642

3743
struct IConvError <: StringEncodingError
@@ -123,7 +129,7 @@ function finalize(s::Union{StringEncoder, StringDecoder})
123129
nothing
124130
end
125131

126-
function iconv!(cd::Ptr{Nothing}, inbuf::Vector{UInt8}, outbuf::Vector{UInt8},
132+
function iconv!(cd::Ptr{Nothing}, inbuf::ByteVector, outbuf::ByteVector,
127133
inbufptr::Ref{Ptr{UInt8}}, outbufptr::Ref{Ptr{UInt8}},
128134
inbytesleft::Ref{Csize_t}, outbytesleft::Ref{Csize_t})
129135
inbufptr[] = pointer(inbuf)
@@ -499,14 +505,20 @@ end
499505
## Functions to encode/decode strings
500506

501507
"""
502-
decode([T,] a::Vector{UInt8}, enc)
508+
decode([T,] a::AbstractVector{UInt8}, enc)
503509
504510
Convert an array of bytes `a` representing text in encoding `enc` to a string of type `T`.
505511
By default, a `String` is returned.
506512
513+
To `decode` an `s::String` of data in non-UTF-8 encoding, use
514+
`decode(codeunits(s), enc)` to act on the underlying byte array.
515+
507516
`enc` can be specified either as a string or as an `Encoding` object.
517+
The input data `a` can be a `Vector{UInt8}` of bytes, a contiguous
518+
subarray thereof, or the `codeunits` of a `String` (or substring
519+
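thereof). Any other `AbstractVector{UInt8}` (for example a non-contiguous view) must first be copied into a `Vector{UInt8}`, e.g. with `Vector{UInt8}(a)`.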
508520
"""
509-
function decode(::Type{T}, a::Vector{UInt8}, enc::Encoding) where {T<:AbstractString}
521+
function decode(::Type{T}, a::ByteVector, enc::Encoding) where {T<:AbstractString}
510522
b = IOBuffer(a)
511523
try
512524
T(read(StringDecoder(b, enc, encoding(T))))
@@ -515,19 +527,19 @@ function decode(::Type{T}, a::Vector{UInt8}, enc::Encoding) where {T<:AbstractSt
515527
end
516528
end
517529

518-
decode(::Type{T}, a::Vector{UInt8}, enc::AbstractString) where {T<:AbstractString} =
530+
decode(::Type{T}, a::ByteVector, enc::AbstractString) where {T<:AbstractString} =
519531
decode(T, a, Encoding(enc))
520532

521-
decode(a::Vector{UInt8}, enc::AbstractString) = decode(String, a, Encoding(enc))
522-
decode(a::Vector{UInt8}, enc::Union{AbstractString, Encoding}) = decode(String, a, enc)
533+
function decode(a::ByteVector, enc::Union{AbstractString, Encoding})
    # No target string type was requested, so decode to `String`.
    return decode(String, a, enc)
end
523534

524535
"""
525536
encode(s::AbstractString, enc)
526537
527538
Convert string `s` to an array of bytes representing text in encoding `enc`.
528539
`enc` can be specified either as a string or as an `Encoding` object.
529540
"""
530-
function encode(s::AbstractString, enc::Encoding)
541+
function encode(s::AbstractString, enc::Encoding)
    # Generic strings are materialized as a `String` and handed to the `ByteString` method.
    native = String(s)
    return encode(native, enc)
end
542+
function encode(s::ByteString, enc::Encoding)
531543
b = IOBuffer()
532544
p = StringEncoder(b, enc, encoding(typeof(s)))
533545
write(p, s)

src/encodings.jl

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ print(io::IO, ::Encoding{enc}) where {enc} = print(io, enc)
2323

2424
## Get the encoding used by a string type
2525
function encoding(::Type{String})
    # `String` data is reported as UTF-8.
    return enc"UTF-8"
end
26+
function encoding(::Type{SubString{String}})
    # A substring of a `String` is reported as UTF-8, the same as `String` itself.
    return enc"UTF-8"
end
2627

2728
encodings_list = ["1026", "1046", "1047", "10646-1:1993", "10646-1:1993/UCS4",
2829
"437", "500", "500V1", "850", "851", "852", "855", "856", "857",

test/runtests.jl

+10
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,16 @@ end
270270
@test_throws ArgumentError readavailable(p)
271271
end
272272

273+
# make sure encode/decode support various string/array types
274+
@testset "Array/String types" begin
275+
s = "Bendaña"
276+
enc = "Windows-1252"
277+
se = "Benda\xf1a"
278+
@test encode(Test.GenericString(s), enc) == codeunits(se)
279+
@test encode(SubString(s, 1:6), enc) == encode(s[1:6], enc) == codeunits(se)[1:6]
280+
@test s == decode(codeunits(se), enc) == decode(collect(codeunits(se)), enc)
281+
@test s[1:6] == decode(@view(collect(codeunits(se))[1:6]), enc)
282+
end
273283

274284
## Test encodings support
275285
b = IOBuffer()

0 commit comments

Comments
 (0)