croissant
import "github.com/b13rg/croissant-go/pkg/croissant"
Package croissant defines the Croissant spec file types and the relations between them.
Index
- Variables
- type ClassRefItem
- type ClassRefList
- func (ref ClassRefList) MarshalJSON() ([]byte, error)
- func (ref *ClassRefList) UnmarshalJSON(data []byte) error
- type ContentExtractionEnumeration
- func NewContentExtractionEnumeration() *ContentExtractionEnumeration
- type DataSet
- func NewDataSet() *DataSet
- func NewDataSetFromPath(filePath string) (*DataSet, error)
- func NewFileSet() *DataSet
- func (ds *DataSet) Validate() ([]types.CroissantWarning, []types.CroissantError)
- func (ds *DataSet) ValidateRecommendedProps() ([]types.CroissantWarning, []types.CroissantError)
- func (ds *DataSet) ValidateRequiredProps() []types.CroissantError
- func (ds *DataSet) WriteToFile(path string) error
- type DataSource
- func NewDataSource() *DataSource
- type DataType
- func NewDataType(mimeType string) *DataType
- type Distribution
- func (d *Distribution) UnmarshalJSON(data []byte) error
- type DistributionItem
- type Extract
- func NewExtract() *Extract
- type Field
- func NewField() *Field
- func (obj *Field) Validate() ([]types.CroissantWarning, []types.CroissantError)
- type FieldRef
- type FieldRefSlice
- func (ref FieldRefSlice) MarshalJSON() ([]byte, error)
- func (ref *FieldRefSlice) UnmarshalJSON(data []byte) error
- type FileObject
- func NewFileObject() *FileObject
- func (*FileObject) Update() error
- func (obj *FileObject) Validate() ([]types.CroissantWarning, []types.CroissantError)
- func (*FileObject) ValidateHash() error
- type FileSet
- func (*FileSet) Update() error
- func (obj *FileSet) Validate() ([]types.CroissantWarning, []types.CroissantError)
- type Format
- type RecordSet
- func NewRecordSet() *RecordSet
- func (obj *RecordSet) Validate() ([]types.CroissantWarning, []types.CroissantError)
- type Split
- func NewSplit(trainSplit string, testSplit string, validationSplit string) *Split
- type Transform
- func NewTransform() *Transform
Variables
SuggestedContext is the suggested `@context` to use in a Croissant JSON-LD file.
// SuggestedContext maps JSON-LD context keys to their expansions: the default
// language and vocabulary, namespace prefixes, and short aliases for terms.
var SuggestedContext = map[string]interface{}{
// Default language and vocabulary for unprefixed terms.
"@language": "en",
"@vocab": "https://schema.org/",
// Namespace prefixes.
"sc": "https://schema.org/",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"dct": "http://purl.org/dc/terms/",
// Term aliases. All map to the cr: prefix except conformsTo (dct:).
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
// data and examples are typed "@json"; dataType is typed "@vocab".
"data": map[string]interface{}{
"@id": "cr:data",
"@type": "@json",
},
"dataType": map[string]interface{}{
"@id": "cr:dataType",
"@type": "@vocab",
},
"examples": map[string]interface{}{
"@id": "cr:examples",
"@type": "@json",
},
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform",
}
type ClassRefItem
// ClassRefItem is a reference to a resource that carries only its `@id`.
type ClassRefItem struct {
// ID of the resource. Serialized as "@id" and omitted when empty.
ID string `json:"@id,omitempty"`
}
type ClassRefList
// ClassRefList is a list of ClassRefItem references. The type declares its own
// MarshalJSON and UnmarshalJSON methods.
type ClassRefList []ClassRefItem
func (ClassRefList) MarshalJSON
func (ref ClassRefList) MarshalJSON() ([]byte, error)
func (*ClassRefList) UnmarshalJSON
func (ref *ClassRefList) UnmarshalJSON(data []byte) error
type ContentExtractionEnumeration
// ContentExtractionEnumeration groups string fields describing file-level
// properties: full path, file name, content, lines and line numbers.
// NOTE(review): presumably these hold the identifiers usable as an Extract
// FileProperty value — confirm against NewContentExtractionEnumeration.
type ContentExtractionEnumeration struct {
// Full path to file, from Croissant extraction or download folders.
FullPath string
// Name of the file (no path).
Filename string
// Byte content of the file.
Content string
// Byte content of each line of the file.
Lines string
// The numbers of each line in the file.
LineNumbers string
}
func NewContentExtractionEnumeration
func NewContentExtractionEnumeration() *ContentExtractionEnumeration
type DataSet
DataSet models the [Dataset class](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#dataset-level-information) of the Croissant spec. It is based on [schema.org/Dataset](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#schemaorgdataset).
// DataSet describes a dataset and its metadata. It carries the JSON-LD
// `@context`, descriptive properties, the distribution (file resources) and
// the record sets. Fields tagged `omitempty` are dropped from the JSON output
// when they hold their zero value.
type DataSet struct {
// Context alias definitions to make the rest of the document shorter.
Context map[string]interface{} `json:"@context"`
// Must be `schema.org/Dataset`.
Type string `json:"@type"`
// The name of the dataset.
Name string `json:"name"`
// Description of the dataset.
Description string `json:"description"`
// Schema version the Croissant file conforms to.
ConformsTo string `json:"conformsTo"`
// A citation to the dataset itself.
CiteAs string `json:"citeAs,omitempty"`
// Licenses of the dataset.
// Spec suggests using references from https://spdx.org/licenses/.
License types.StringOrSlice `json:"license"`
// URL of the dataset, usually a webpage.
URL string `json:"url"`
// One or more Persons or Organizations that created the dataset.
Creator []string `json:"creator,omitempty"`
// The date the dataset was published.
DatePublished string `json:"datePublished,omitempty"`
// Keywords associated with the dataset.
Keywords []string `json:"keywords,omitempty"`
// Publisher of the dataset, sometimes distinct from creator.
Publisher []string `json:"publisher,omitempty"`
// Version of the dataset.
// Either a single int, or a MAJOR.MINOR.PATCH semantic version string.
// [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html)
Version string `json:"version,omitempty"`
// Date the dataset was initially created.
DateCreated string `json:"dateCreated,omitempty"`
// Date the dataset was last modified.
DateModified string `json:"dateModified,omitempty"`
// List of URLs that represent the same dataset as this one.
SameAs []string `json:"sameAs,omitempty"`
// License that applies to the Croissant metadata.
SdLicense []string `json:"sdLicense,omitempty"`
// Language of the content of the dataset.
InLanguage []string `json:"inLanguage,omitempty"`
// Whether the dataset is a live dataset (in the process of being updated).
IsLiveDataset bool `json:"isLiveDataset,omitempty"`
// List of FileObjects and FileSets associated with the dataset.
// Modified from schema.org/Dataset.
// Required.
// NOTE(review): documented as required but tagged `omitempty`, so an empty
// distribution is left out of the marshalled JSON — confirm this is intended.
Distribution Distribution `json:"distribution,omitempty"`
// List of RecordSets associated with the dataset.
RecordSets []RecordSet `json:"recordSet"`
}
func NewDataSet
func NewDataSet() *DataSet
func NewDataSetFromPath
func NewDataSetFromPath(filePath string) (*DataSet, error)
func NewFileSet
func NewFileSet() *DataSet
func (*DataSet) Validate
func (ds *DataSet) Validate() ([]types.CroissantWarning, []types.CroissantError)
func (*DataSet) ValidateRecommendedProps
func (ds *DataSet) ValidateRecommendedProps() ([]types.CroissantWarning, []types.CroissantError)
func (*DataSet) ValidateRequiredProps
func (ds *DataSet) ValidateRequiredProps() []types.CroissantError
func (*DataSet) WriteToFile
func (ds *DataSet) WriteToFile(path string) error
type DataSource
// DataSource describes where a Field's data comes from: a reference to a
// FileObject, FileSet or RecordSet, plus optional extraction, transformation
// and format settings. Every field is omitted from JSON when unset.
type DataSource struct {
// Must be DataSource.
NType *string `json:"@type,omitempty"`
// Node ID (embedded, serialized as "@id").
ClassRefItem
// The name of the referenced FileObject source of the data.
FileObject *ClassRefItem `json:"fileObject,omitempty"`
// The name of the referenced FileSet source.
FileSet *ClassRefItem `json:"fileSet,omitempty"`
// The name of the referenced RecordSet source.
RecordSet *ClassRefItem `json:"recordSet,omitempty"`
// The extraction method from the provided source.
Extract *Extract `json:"extract,omitempty"`
// Transformations to apply to data after extraction.
Transform *Transform `json:"transform,omitempty"`
// A format to parse data values from text.
Format *Format `json:"format,omitempty"`
}
func NewDataSource
func NewDataSource() *DataSource
type DataType
// DataType wraps a single data type string.
type DataType struct {
// MIME type.
DataType string
}
func NewDataType
func NewDataType(mimeType string) *DataType
type Distribution
Type used to group data resource objects together.
// Distribution is a list of DistributionItem values. The type declares its own
// UnmarshalJSON method for decoding.
type Distribution []DistributionItem
func (*Distribution) UnmarshalJSON
func (d *Distribution) UnmarshalJSON(data []byte) error
type DistributionItem
// DistributionItem is the interface satisfied by the entries of a Distribution.
// In this package *FileObject and *FileSet both declare a matching Validate method.
type DistributionItem interface {
// Validate reports the warnings and errors found on the item.
Validate() ([]types.CroissantWarning, []types.CroissantError)
}
type Extract
// Extract describes how values are obtained from a data source: by file
// property, by column name, or by JSON path. Unset fields are omitted from JSON.
type Extract struct {
// Extraction method.
FileProperty string `json:"fileProperty,omitempty"`
// Name of the column (field) that contains values.
Column string `json:"column,omitempty"`
// A JSON path expression that obtains values.
JsonPath string `json:"jsonPath,omitempty"`
}
func NewExtract
func NewExtract() *Extract
type Field
// Field describes one data element of a RecordSet. Fields can nest through
// SubField and can point at other fields through References and ParentField.
type Field struct {
// Must be field.
Type string `json:"@type"`
// Node ID (embedded, serialized as "@id").
ClassRefItem
// Name of the Field.
Name string `json:"name"`
// Description of the Field.
Description string `json:"description"`
// The data types that correspond to the Field.
DataType types.StringOrSlice `json:"dataType,omitempty"`
// The source of data for the field.
Source *DataSource `json:"source,omitempty"`
// If true, the Field is a list of DataType values.
Repeated bool `json:"repeated,omitempty"`
// A property URL that is equivalent to this field.
EquivalentProperty string `json:"equivalentProperty,omitempty"`
// References to other fields in a different RecordSet.
References FieldRefSlice `json:"references,omitempty"`
// List of Fields nested inside this one.
SubField []Field `json:"subField,omitempty"`
// References to other Fields in the same RecordSet.
ParentField FieldRefSlice `json:"parentField,omitempty"`
}
func NewField
func NewField() *Field
func (*Field) Validate
func (obj *Field) Validate() ([]types.CroissantWarning, []types.CroissantError)
type FieldRef
// FieldRef is a reference to a Field, serialized as an object with a single
// "field" key holding the referenced node's `@id`.
type FieldRef struct {
// Reference to the target field.
Field ClassRefItem `json:"field"`
}
type FieldRefSlice
// FieldRefSlice is a list of FieldRef values. The type declares its own
// MarshalJSON and UnmarshalJSON methods.
type FieldRefSlice []FieldRef
func (FieldRefSlice) MarshalJSON
func (ref FieldRefSlice) MarshalJSON() ([]byte, error)
func (*FieldRefSlice) UnmarshalJSON
func (ref *FieldRefSlice) UnmarshalJSON(data []byte) error
type FileObject
// FileObject describes a single file resource: its location, size, format and
// checksum, and optionally the resources it is contained in.
type FileObject struct {
// Must be FileObject.
Type string `json:"@type"`
// Node ID (embedded, serialized as "@id").
ClassRefItem
// The name of the file.
Name string `json:"name"`
// Description of the file.
Description string `json:"description"`
// URL to the actual bytes of the file object.
ContentURL string `json:"contentUrl"`
// File size in [mega/kilo/...]bytes.
// Defaults to bytes if unit not specified.
ContentSize string `json:"contentSize,omitempty"`
// Format of the file given as a MIME type.
EncodingFormat string `json:"encodingFormat"`
// Checksum of the file contents.
Sha256 string `json:"sha256,omitempty"`
// Another FileObject or FileSet this resource is contained in.
ContainedIn ClassRefList `json:"containedIn,omitempty"`
}
func NewFileObject
func NewFileObject() *FileObject
func (*FileObject) Update
func (*FileObject) Update() error
Update FileObject struct from resource.
func (*FileObject) Validate
func (obj *FileObject) Validate() ([]types.CroissantWarning, []types.CroissantError)
func (*FileObject) ValidateHash
func (*FileObject) ValidateHash() error
type FileSet
// FileSet describes a set of files selected from a containing resource by
// include and exclude glob patterns.
type FileSet struct {
// Must be FileSet.
Type string `json:"@type"`
// Node ID (embedded, serialized as "@id").
ClassRefItem
// Name of the FileSet.
Name string `json:"name"`
// Description of the FileSet.
Description string `json:"description"`
// The FileSet or FileObject the resource is contained in.
ContainedIn ClassRefList `json:"containedIn"`
// MIME type.
EncodingFormat string `json:"encodingFormat"`
// A glob pattern of files to include.
Includes string `json:"includes"`
// A glob pattern of files to exclude.
Excludes string `json:"excludes,omitempty"`
}
func (*FileSet) Update
func (*FileSet) Update() error
Update FileSet struct from resource.
func (*FileSet) Validate
func (obj *FileSet) Validate() ([]types.CroissantWarning, []types.CroissantError)
type Format
// Format is an empty struct; it currently declares no fields.
type Format struct{}
type RecordSet
// RecordSet describes a set of records: the Fields that make them up, the
// key that identifies them, and optional inline data and examples.
type RecordSet struct {
// Must be RecordSet.
Type string `json:"@type"`
// Node ID (embedded, serialized as "@id").
ClassRefItem
// Name of the RecordSet.
Name string `json:"name"`
// Description of the RecordSet.
Description string `json:"description"`
// The data types that correspond to all fields in the RecordSet.
DataType types.StringOrSlice `json:"dataType"`
// One or more Fields that uniquely identify records in the RecordSet.
Key ClassRefList `json:"key,omitempty"`
// List of data element Fields that appear in the RecordSet.
Field []Field `json:"field"`
// One or more records that constitute the data of the RecordSet.
Data []interface{} `json:"data,omitempty"`
// One or more records provided as an example of the RecordSet.
Examples []interface{} `json:"examples,omitempty"`
}
func NewRecordSet
func NewRecordSet() *RecordSet
func (*RecordSet) Validate
func (obj *RecordSet) Validate() ([]types.CroissantWarning, []types.CroissantError)
type Split
// Split holds three strings, one each for the train, test and validation
// splits, matching the parameters of NewSplit.
type Split struct {
// Train split value.
TrainSplit string
// Test split value.
TestSplit string
// Validation split value.
ValidationSplit string
}
func NewSplit
func NewSplit(trainSplit string, testSplit string, validationSplit string) *Split
type Transform
// Transform describes transformations applied to source data. Every field is
// optional and omitted from JSON when empty.
type Transform struct {
// Split data source string on character.
Delimiter string `json:"delimiter,omitempty"`
// Apply regex to data source.
Regex string `json:"regex,omitempty"`
// The path to extract JSON from.
JsonPath string `json:"jsonPath,omitempty"`
// A JSON query to evaluate against the data source.
// NOTE(review): the tag is all-lowercase "jsonquery" while the sibling tag is
// camelCase "jsonPath" — confirm the key spelling against the Croissant spec.
JsonQuery string `json:"jsonquery,omitempty"`
}
func NewTransform
func NewTransform() *Transform