Skip to content

Commit 675471c

Browse files
authored
RawRecord PR follow-up - address DGollings' comment (#139)
1 parent 4aa56d6 commit 675471c

File tree

9 files changed

+82
-56
lines changed

9 files changed

+82
-56
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ situations.
6565
- Golang 1.14
6666

6767
## Recent Major Feature Additions/Changes
68-
- Added `Transform.CurrentRawRecord()` for caller of omniparser to access the raw ingested record.
68+
- Added `Transform.RawRecord()` for callers of omniparser to access the raw ingested record.
6969
- Deprecated `custom_parse` in favor of `custom_func` (`custom_parse` is still usable for
7070
backward compatibility; it has just been removed from all public docs and samples).
7171
- Added `NonValidatingReader` EDI segment reader.

doc/gettingstarted.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,10 @@ for {
715715
break
716716
}
717717
if err != nil { ... }
718-
// output contains a []byte of the ingested and transformed record.
718+
// output contains a []byte of the ingested and transformed record.
719+
720+
// Also transform.RawRecord() gives you access to the raw record.
721+
fmt.Println(transform.RawRecord().Checksum())
719722
}
720723
```
721724

doc/programmability.md

+3-5
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,9 @@ for {
3535
}
3636
if err != nil { ... }
3737
// output contains a []byte of the ingested and transformed record.
38-
39-
raw, err := transform.CurrentRawRecord()
40-
if err != nil { ... }
41-
rawRecord := raw.(*omniv21.RawRecord) // assuming the schema is of `omni.2.1` version.
42-
fmt.Println(rawRecord.UUIDv3()) // rawRecord.UUIDv3() returns a stable hash of the current raw record.
38+
39+
// Also transform.RawRecord() gives you access to the raw record.
40+
fmt.Println(transform.RawRecord().Checksum())
4341
}
4442
```
4543
Note this out-of-box omniparser setup contains only the `omni.2.1` schema handler, meaning only schemas

extensions/omniv21/ingester.go

+16-14
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,21 @@ import (
99
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat"
1010
"github.com/jf-tech/omniparser/extensions/omniv21/transform"
1111
"github.com/jf-tech/omniparser/idr"
12+
"github.com/jf-tech/omniparser/schemahandler"
1213
"github.com/jf-tech/omniparser/transformctx"
1314
)
1415

15-
// RawRecord contains the raw data ingested in from the input stream in the form of an IDR tree.
16-
// Note callers outside this package should absolutely make **NO** modifications to the content of
17-
// RawRecord. Treat it like read-only.
18-
type RawRecord struct {
19-
Node *idr.Node
16+
type rawRecord struct {
17+
node *idr.Node
2018
}
2119

22-
// UUIDv3 returns a stable MD5(v3) hash of the RawRecord.
23-
func (rr *RawRecord) UUIDv3() string {
24-
hash, _ := customfuncs.UUIDv3(nil, idr.JSONify2(rr.Node))
20+
func (rr *rawRecord) Raw() interface{} {
21+
return rr.node
22+
}
23+
24+
// Checksum returns a stable MD5(v3) hash of the rawRecord.
25+
func (rr *rawRecord) Checksum() string {
26+
hash, _ := customfuncs.UUIDv3(nil, idr.JSONify2(rr.node))
2527
return hash
2628
}
2729

@@ -31,19 +33,19 @@ type ingester struct {
3133
customParseFuncs transform.CustomParseFuncs // Deprecated.
3234
ctx *transformctx.Ctx
3335
reader fileformat.FormatReader
34-
rawRecord RawRecord
36+
rawRecord rawRecord
3537
}
3638

3739
// Read ingests a raw record from the input stream, transforms it according to the given schema and returns
3840
// the raw record and the transformed JSON bytes.
39-
func (g *ingester) Read() (interface{}, []byte, error) {
40-
if g.rawRecord.Node != nil {
41-
g.reader.Release(g.rawRecord.Node)
42-
g.rawRecord.Node = nil
41+
func (g *ingester) Read() (schemahandler.RawRecord, []byte, error) {
42+
if g.rawRecord.node != nil {
43+
g.reader.Release(g.rawRecord.node)
44+
g.rawRecord.node = nil
4345
}
4446
n, err := g.reader.Read()
4547
if n != nil {
46-
g.rawRecord.Node = n
48+
g.rawRecord.node = n
4749
}
4850
if err != nil {
4951
// Read() is supposed to have already done CtxAwareErr error wrapping, so return the error directly.

extensions/omniv21/ingester_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ func TestIngester_Read_Success(t *testing.T) {
9191
}
9292
raw, b, err := g.Read()
9393
assert.NoError(t, err)
94-
assert.Equal(t, "41665284-dab9-300d-b647-7ace9cb514b4", raw.(*RawRecord).UUIDv3())
94+
assert.Equal(t, "41665284-dab9-300d-b647-7ace9cb514b4", raw.Checksum())
95+
assert.Equal(t, "{}", idr.JSONify2(raw.Raw().(*idr.Node)))
9596
assert.Equal(t, "123", string(b))
9697
assert.Equal(t, 0, g.reader.(*testReader).releaseCalled)
9798
raw, b, err = g.Read()

extensions/omniv21/samples/testCommon.go

+3-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
"github.com/stretchr/testify/assert"
1212

1313
"github.com/jf-tech/omniparser"
14-
"github.com/jf-tech/omniparser/extensions/omniv21"
1514
"github.com/jf-tech/omniparser/idr"
1615
"github.com/jf-tech/omniparser/transformctx"
1716
)
@@ -49,12 +48,11 @@ func SampleTestCommon(t *testing.T, schemaFile, inputFile string) string {
4948
err = json.Unmarshal(recordBytes, &transformed)
5049
assert.NoError(t, err)
5150

52-
raw, err := transform.CurrentRawRecord()
51+
raw, err := transform.RawRecord()
5352
assert.NoError(t, err)
54-
rawRecord := raw.(*omniv21.RawRecord)
5553
records = append(records, record{
56-
RawRecord: idr.JSONify2(rawRecord.Node),
57-
RawRecordHash: rawRecord.UUIDv3(),
54+
RawRecord: idr.JSONify2(raw.Raw().(*idr.Node)),
55+
RawRecordHash: raw.Checksum(),
5856
TransformedRecord: transformed,
5957
})
6058
}

schemahandler/schemaHandler.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ type SchemaHandler interface {
3737
NewIngester(ctx *transformctx.Ctx, input io.Reader) (Ingester, error)
3838
}
3939

40+
// RawRecord represents a raw record ingested from the input.
41+
type RawRecord interface {
42+
// Raw returns the actual raw record, which is version specific to each schema handler.
43+
Raw() interface{}
44+
// Checksum returns a UUIDv3 (MD5) stable hash of the raw record.
45+
Checksum() string
46+
}
47+
4048
// Ingester is an interface of ingestion and transformation for a given input stream.
4149
type Ingester interface {
4250
// Read is called repeatedly during the processing of an input stream. Each call it should return
@@ -46,7 +54,7 @@ type Ingester interface {
4654
// one record at a time, OR, processes and returns one record for each call. However, the overall
4755
// design principle of omniparser is to have streaming processing capability so memory won't be a
4856
// constraint when dealing with large input files. All built-in ingesters are implemented this way.
49-
Read() (interface{}, []byte, error)
57+
Read() (RawRecord, []byte, error)
5058

5159
// IsContinuableError is called to determine if the error returned by Read is fatal or not. After Read
5260
// is called, the result record or error will be returned to caller. After caller consumes record or

transform.go

+7-9
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,14 @@ type Transform interface {
2121
// return the same error.
2222
// Note if returned error isn't nil, then returned []byte will be nil.
2323
Read() ([]byte, error)
24-
// CurrentRawRecord returns the current raw record ingested from the input stream. If
25-
// the last Read call failed, or Read hasn't been called yet, it will return an error.
26-
// Each schema handler and extension has its own definition of what a raw record is
27-
// so please check their corresponding doc.
28-
CurrentRawRecord() (interface{}, error)
24+
// RawRecord returns the current raw record ingested from the input stream. If the last
25+
// Read call failed, or Read hasn't been called yet, it will return an error.
26+
RawRecord() (schemahandler.RawRecord, error)
2927
}
3028

3129
type transform struct {
3230
ingester schemahandler.Ingester
33-
lastRawRecord interface{}
31+
lastRawRecord schemahandler.RawRecord
3432
lastErr error
3533
}
3634

@@ -70,9 +68,9 @@ func (o *transform) Read() ([]byte, error) {
7068
return transformed, err
7169
}
7270

73-
// CurrentRawRecord returns the current raw record ingested from the input stream. If
74-
// the last Read call failed, or Read hasn't been called yet, it will return an error.
75-
func (o *transform) CurrentRawRecord() (interface{}, error) {
71+
// RawRecord returns the current raw record ingested from the input stream. If the last
72+
// Read call failed, or Read hasn't been called yet, it will return an error.
73+
func (o *transform) RawRecord() (schemahandler.RawRecord, error) {
7674
if o.lastErr != nil {
7775
return nil, o.lastErr
7876
}

transform_test.go

+37-19
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,41 @@ import (
99
"github.com/stretchr/testify/assert"
1010

1111
"github.com/jf-tech/omniparser/errs"
12+
"github.com/jf-tech/omniparser/schemahandler"
1213
)
1314

1415
type testReadCall struct {
15-
record []byte
16+
result []byte
1617
err error
1718
}
1819

20+
func (trc testReadCall) Checksum() string {
21+
if trc.err != nil {
22+
panic("Checksum() called when err != nil")
23+
}
24+
return fmt.Sprintf("checksum of raw record of '%s'", string(trc.result))
25+
}
26+
27+
func (trc testReadCall) Raw() interface{} {
28+
if trc.err != nil {
29+
panic("Raw() called when err != nil")
30+
}
31+
return fmt.Sprintf("raw record of '%s'", string(trc.result))
32+
}
33+
1934
type testIngester struct {
2035
readCalled int
2136
readCalls []testReadCall
2237
continuableErrs map[error]bool
2338
}
2439

25-
func (g *testIngester) Read() (interface{}, []byte, error) {
40+
func (g *testIngester) Read() (schemahandler.RawRecord, []byte, error) {
2641
if g.readCalled >= len(g.readCalls) {
2742
panic(fmt.Sprintf("Read() called %d time(s), but not enough mock entries setup", g.readCalled))
2843
}
2944
r := g.readCalls[g.readCalled]
3045
g.readCalled++
31-
return fmt.Sprintf("raw record %d", g.readCalled-1), r.record, r.err
46+
return r, r.result, r.err
3247
}
3348

3449
func (g *testIngester) IsContinuableError(err error) bool {
@@ -45,9 +60,9 @@ func TestTransform_Read_EndWithEOF(t *testing.T) {
4560
tfm := &transform{
4661
ingester: &testIngester{
4762
readCalls: []testReadCall{
48-
{record: []byte("1st good read")},
63+
{result: []byte("1st good read")},
4964
{err: continuableErr1},
50-
{record: []byte("2nd good read")},
65+
{result: []byte("2nd good read")},
5166
{err: io.EOF},
5267
},
5368
continuableErrs: map[error]bool{continuableErr1: true},
@@ -56,32 +71,34 @@ func TestTransform_Read_EndWithEOF(t *testing.T) {
5671
record, err := tfm.Read()
5772
assert.NoError(t, err)
5873
assert.Equal(t, "1st good read", string(record))
59-
raw, err := tfm.CurrentRawRecord()
74+
raw, err := tfm.RawRecord()
6075
assert.NoError(t, err)
61-
assert.Equal(t, "raw record 0", raw.(string))
76+
assert.Equal(t, "raw record of '1st good read'", raw.Raw())
77+
assert.Equal(t, "checksum of raw record of '1st good read'", raw.Checksum())
6278

6379
record, err = tfm.Read()
6480
assert.Error(t, err)
6581
assert.True(t, errs.IsErrTransformFailed(err))
6682
assert.Equal(t, continuableErr1.Error(), err.Error())
6783
assert.Nil(t, record)
68-
raw, err = tfm.CurrentRawRecord()
84+
raw, err = tfm.RawRecord()
6985
assert.Error(t, err)
7086
assert.True(t, errs.IsErrTransformFailed(err))
7187
assert.Nil(t, raw)
7288

7389
record, err = tfm.Read()
7490
assert.NoError(t, err)
7591
assert.Equal(t, "2nd good read", string(record))
76-
raw, err = tfm.CurrentRawRecord()
92+
raw, err = tfm.RawRecord()
7793
assert.NoError(t, err)
78-
assert.Equal(t, "raw record 2", raw.(string))
94+
assert.Equal(t, "raw record of '2nd good read'", raw.Raw())
95+
assert.Equal(t, "checksum of raw record of '2nd good read'", raw.Checksum())
7996

8097
record, err = tfm.Read()
8198
assert.Error(t, err)
8299
assert.Equal(t, io.EOF, err)
83100
assert.Nil(t, record)
84-
raw, err = tfm.CurrentRawRecord()
101+
raw, err = tfm.RawRecord()
85102
assert.Error(t, err)
86103
assert.Equal(t, io.EOF, err)
87104
assert.Nil(t, raw)
@@ -91,7 +108,7 @@ func TestTransform_Read_EndWithEOF(t *testing.T) {
91108
assert.Error(t, err)
92109
assert.Equal(t, io.EOF, err)
93110
assert.Nil(t, record)
94-
raw, err = tfm.CurrentRawRecord()
111+
raw, err = tfm.RawRecord()
95112
assert.Error(t, err)
96113
assert.Equal(t, io.EOF, err)
97114
assert.Nil(t, raw)
@@ -101,24 +118,25 @@ func TestTransform_Read_EndWithNonContinuableError(t *testing.T) {
101118
tfm := &transform{
102119
ingester: &testIngester{
103120
readCalls: []testReadCall{
104-
{record: []byte("1st good read")},
121+
{result: []byte("1st good read")},
105122
{err: errors.New("fatal error")},
106123
},
107124
},
108125
}
109126
record, err := tfm.Read()
110127
assert.NoError(t, err)
111128
assert.Equal(t, "1st good read", string(record))
112-
raw, err := tfm.CurrentRawRecord()
129+
raw, err := tfm.RawRecord()
113130
assert.NoError(t, err)
114-
assert.Equal(t, "raw record 0", raw.(string))
131+
assert.Equal(t, "raw record of '1st good read'", raw.Raw())
132+
assert.Equal(t, "checksum of raw record of '1st good read'", raw.Checksum())
115133

116134
record, err = tfm.Read()
117135
assert.Error(t, err)
118136
assert.False(t, errs.IsErrTransformFailed(err))
119137
assert.Equal(t, "fatal error", err.Error())
120138
assert.Nil(t, record)
121-
raw, err = tfm.CurrentRawRecord()
139+
raw, err = tfm.RawRecord()
122140
assert.Error(t, err)
123141
assert.False(t, errs.IsErrTransformFailed(err))
124142
assert.Equal(t, "fatal error", err.Error())
@@ -129,15 +147,15 @@ func TestTransform_Read_EndWithNonContinuableError(t *testing.T) {
129147
assert.Error(t, err)
130148
assert.Equal(t, "fatal error", err.Error())
131149
assert.Nil(t, record)
132-
raw, err = tfm.CurrentRawRecord()
150+
raw, err = tfm.RawRecord()
133151
assert.Error(t, err)
134152
assert.Equal(t, "fatal error", err.Error())
135153
assert.Nil(t, raw)
136154
}
137155

138-
func TestTransform_CurrentRawRecord_CalledBeforeRead(t *testing.T) {
156+
func TestTransform_RawRecord_CalledBeforeRead(t *testing.T) {
139157
tfm := &transform{ingester: &testIngester{readCalls: []testReadCall{}}}
140-
raw, err := tfm.CurrentRawRecord()
158+
raw, err := tfm.RawRecord()
141159
assert.Error(t, err)
142160
assert.Equal(t, "must call Read first", err.Error())
143161
assert.Nil(t, raw)

0 commit comments

Comments
 (0)