Skip to content

Commit

Permalink
encoding: publicly expose identifier.{MIB,Interface}
Browse files Browse the repository at this point in the history
  • Loading branch information
rykov committed Dec 2, 2021
1 parent 18b340f commit 4aad7e7
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 70 deletions.
9 changes: 9 additions & 0 deletions encoding/ianaindex/ianaindex.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ func (x *Index) Name(e encoding.Encoding) (string, error) {
return x.names(v), nil
}

// FindMIB returns the encoding that x associates with the given MIB enum
// identifier. It returns errUnsupported when findMIB reports that mib is not
// present in x.toMIB.
//
// NOTE(review): the entry of x.enc is returned as-is; whether it can be nil
// for a MIB that is present in x.toMIB is not visible here.
func (x *Index) FindMIB(mib identifier.MIB) (encoding.Encoding, error) {
	if idx := findMIB(x.toMIB, mib); idx != -1 {
		return x.enc[idx], nil
	}
	return nil, errUnsupported
}

// TODO: the coverage of this index is rather spotty. Allowing users to set
// encodings would allow:
// - users to increase coverage
Expand Down
15 changes: 15 additions & 0 deletions encoding/ianaindex/ianaindex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,21 @@ func TestEncoding(t *testing.T) {
if got, err := tc.index.Name(enc); got != tc.canonical {
t.Errorf("%d: Name(Encoding(%q)) = %q; want %q (%v)", i, tc.name, got, tc.canonical, err)
}

id, ok := enc.(identifier.Interface)
if !ok {
t.Errorf("%d: encoding %q has no ID", i, tc.name)
}
mib, _ := id.ID()
if mib == 0 {
t.Errorf("%d: encoding %q returned 0 MIB enum", i, tc.name)
}
mibEnc, err := tc.index.FindMIB(mib)
if err != nil {
t.Errorf("%d: FindMIB error %q", i, err)
} else if mibEnc != enc {
t.Errorf("%d: FindMIB did not match encoding", i)
}
}
}

Expand Down
79 changes: 79 additions & 0 deletions encoding/identifier/identifier.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package identifier defines the contract between implementations of Encoding
// and Index by defining identifiers that uniquely identify standardized coded
// character sets (CCS) and character encoding schemes (CES), which we will
// together refer to as encodings, for which Encoding implementations provide
// converters to and from UTF-8. This package is typically only of concern to
// implementers of Indexes and Encodings.
//
// One part of the identifier is the MIB code, which is defined by IANA and
// uniquely identifies a CCS or CES. Each code is associated with data that
// references authorities, official documentation as well as aliases and MIME
// names.
//
// Not all CESs are covered by the IANA registry. The "other" string that is
// returned by ID can be used to identify other character sets or versions of
// existing ones.
//
// It is recommended that each package that provides a set of Encodings provide
// the All and Common variables to reference all supported encodings and a
// commonly used subset, respectively. This allows Index implementations to
// include all available encodings without explicitly referencing or knowing
// about them.
package identifier

// Note: this package used to be internal; it is now public so that
// third-party Indexes and Encodings can be written against it.

// References:
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
// - http://www.ietf.org/rfc/rfc2978.txt
// - https://www.unicode.org/reports/tr22/
// - http://www.w3.org/TR/encoding/
// - https://encoding.spec.whatwg.org/
// - https://encoding.spec.whatwg.org/encodings.json
// - https://tools.ietf.org/html/rfc6657#section-5

// Interface can be implemented by an Encoding to declare the CCS or CES for
// which it implements conversions.
type Interface interface {
// ID returns the identifier of the encoding as a MIB code and an "other"
// string. Exactly one of the mib and other values should be non-zero.
//
// In the usual case it is only necessary to indicate the MIB code. The
// other string can be used to specify encodings for which there is no MIB,
// such as "x-mac-dingbat".
//
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
ID() (mib MIB, other string)

// NOTE: the character restrictions on the other string are there to allow
// extending the syntax with additional information such as versions,
// vendors and other variants.
}

// MIB is a numeric identifier for an encoding. Its values are taken from the
// IANA MIB codes, supplemented with a few identifiers for encodings that the
// IANA standard does not cover.
//
// See http://www.iana.org/assignments/ianacharset-mib.
type MIB uint16

// The MIB values below have no IANA registration. They exist because the
// encodings they name are common and defined within the text repo.
const (
	// Unofficial is the first value of the range used for encodings that are
	// not registered by IANA.
	Unofficial MIB = iota + 10000

	// Replacement identifies the WhatWG replacement encoding.
	Replacement

	// XUserDefined identifies x-user-defined.
	XUserDefined

	// MacintoshCyrillic identifies x-mac-cyrillic.
	MacintoshCyrillic
)
79 changes: 9 additions & 70 deletions encoding/internal/identifier/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,78 +4,17 @@

//go:generate go run gen.go

// Package identifier defines the contract between implementations of Encoding
// and Index by defining identifiers that uniquely identify standardized coded
// character sets (CCS) and character encoding schemes (CES), which we will
// together refer to as encodings, for which Encoding implementations provide
// converters to and from UTF-8. This package is typically only of concern to
// implementers of Indexes and Encodings.
//
// One part of the identifier is the MIB code, which is defined by IANA and
// uniquely identifies a CCS or CES. Each code is associated with data that
// references authorities, official documentation as well as aliases and MIME
// names.
//
// Not all CESs are covered by the IANA registry. The "other" string that is
// returned by ID can be used to identify other character sets or versions of
// existing ones.
//
// It is recommended that each package that provides a set of Encodings provide
// the All and Common variables to reference all supported encodings and
// commonly used subset. This allows Index implementations to include all
// available encodings without explicitly referencing or knowing about them.
package identifier

// Note: this package is internal, but could be made public if there is a need
// for writing third-party Indexes and Encodings.

// References:
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
// - http://www.ietf.org/rfc/rfc2978.txt
// - https://www.unicode.org/reports/tr22/
// - http://www.w3.org/TR/encoding/
// - https://encoding.spec.whatwg.org/
// - https://encoding.spec.whatwg.org/encodings.json
// - https://tools.ietf.org/html/rfc6657#section-5

// Interface can be implemented by Encodings to define the CCS or CES for which
// it implements conversions.
type Interface interface {
// ID returns an encoding identifier. Exactly one of the mib and other
// values should be non-zero.
//
// In the usual case it is only necessary to indicate the MIB code. The
// other string can be used to specify encodings for which there is no MIB,
// such as "x-mac-dingbat".
//
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
ID() (mib MIB, other string)

// NOTE: the restrictions on the encoding are to allow extending the syntax
// with additional information such as versions, vendors and other variants.
}

// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds
// some identifiers for some encodings that are not covered by the IANA
// standard.
//
// See http://www.iana.org/assignments/ianacharset-mib.
type MIB uint16

// These additional MIB types are not defined in IANA. They are added because
// they are common and defined within the text repo.
const (
// Unofficial marks the start of encodings not registered by IANA.
Unofficial MIB = 10000 + iota
import (
"golang.org/x/text/encoding/identifier"
)

// Replacement is the WhatWG replacement encoding.
Replacement
// Replacement is the WhatWG replacement encoding. It re-exports the constant
// from the public identifier package and is itself declared as a constant, so
// that it cannot be reassigned and stays usable in constant expressions.
const Replacement = identifier.Replacement

// XUserDefined is the code for x-user-defined.
XUserDefined
// Aliases that keep the names of this internal package pointing at the types
// of the public identifier package.
type (
	// Interface is an alias for identifier.Interface.
	Interface = identifier.Interface

	// MIB is an alias for identifier.MIB.
	MIB = identifier.MIB
)

// MacintoshCyrillic is the code for x-mac-cyrillic.
MacintoshCyrillic
)
// Re-exports of the unofficial MIB values of the public identifier package.
// They are declared as constants rather than variables so that they cannot be
// reassigned and stay usable in constant expressions.
const (
	// Unofficial marks the start of encodings not registered by IANA.
	Unofficial = identifier.Unofficial

	// MacintoshCyrillic is the code for x-mac-cyrillic.
	MacintoshCyrillic = identifier.MacintoshCyrillic

	// XUserDefined is the code for x-user-defined.
	XUserDefined = identifier.XUserDefined
)

0 comments on commit 4aad7e7

Please sign in to comment.