Skip to content

Commit 84646ac

Browse files
authored
Support parsing and pretty display for StructType (#7469)
* Support parsing and pretty-printing of StructType
* Remove Struct from the TODO comment listing unsupported structured types
1 parent 22a2ef9 commit 84646ac

File tree

2 files changed

+108
-11
lines changed

2 files changed

+108
-11
lines changed

arrow-schema/src/datatype.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,22 @@ pub enum UnionMode {
458458

459459
impl fmt::Display for DataType {
460460
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
461-
write!(f, "{self:?}")
461+
match &self {
462+
DataType::Struct(fields) => {
463+
write!(f, "Struct(")?;
464+
if !fields.is_empty() {
465+
let fields_str = fields
466+
.iter()
467+
.map(|f| format!("{} {}", f.name(), f.data_type()))
468+
.collect::<Vec<_>>()
469+
.join(", ");
470+
write!(f, "{}", fields_str)?;
471+
}
472+
write!(f, ")")?;
473+
Ok(())
474+
}
475+
_ => write!(f, "{self:?}"),
476+
}
462477
}
463478
}
464479

arrow-schema/src/datatype_parse.rs

Lines changed: 92 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
1919

20-
use crate::{ArrowError, DataType, Field, IntervalUnit, TimeUnit};
20+
use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
2121

2222
pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
2323
Parser::new(val).parse()
@@ -78,6 +78,11 @@ impl<'a> Parser<'a> {
7878
Token::List => self.parse_list(),
7979
Token::LargeList => self.parse_large_list(),
8080
Token::FixedSizeList => self.parse_fixed_size_list(),
81+
Token::Struct => self.parse_struct(),
82+
Token::FieldName(word) => Err(make_error(
83+
self.val,
84+
&format!("unrecognized word: {}", word),
85+
)),
8186
tok => Err(make_error(
8287
self.val,
8388
&format!("finding next type, got unexpected '{tok}'"),
@@ -150,6 +155,10 @@ impl<'a> Parser<'a> {
150155
fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
151156
match self.next_token()? {
152157
Token::DoubleQuotedString(s) => Ok(s),
158+
Token::FieldName(word) => Err(make_error(
159+
self.val,
160+
&format!("unrecognized word: {}", word),
161+
)),
153162
tok => Err(make_error(
154163
self.val,
155164
&format!("finding double quoted string for {context}, got '{tok}'"),
@@ -291,6 +300,46 @@ impl<'a> Parser<'a> {
291300
Box::new(value_type),
292301
))
293302
}
303+
fn parse_struct(&mut self) -> ArrowResult<DataType> {
304+
self.expect_token(Token::LParen)?;
305+
let mut fields = Vec::new();
306+
loop {
307+
let field_name = match self.next_token()? {
308+
// It's valid to have a name that is a type name
309+
Token::SimpleType(data_type) => data_type.to_string(),
310+
Token::FieldName(name) => name,
311+
Token::RParen => {
312+
if fields.is_empty() {
313+
break;
314+
} else {
315+
return Err(make_error(
316+
self.val,
317+
"Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
318+
));
319+
}
320+
}
321+
tok => {
322+
return Err(make_error(
323+
self.val,
324+
&format!("Expected a word for the name of Struct, but got {tok}"),
325+
))
326+
}
327+
};
328+
let field_type = self.parse_next_type()?;
329+
fields.push(Arc::new(Field::new(field_name, field_type, true)));
330+
match self.next_token()? {
331+
Token::Comma => continue,
332+
Token::RParen => break,
333+
tok => {
334+
return Err(make_error(
335+
self.val,
336+
&format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"),
337+
))
338+
}
339+
}
340+
}
341+
Ok(DataType::Struct(Fields::from(fields)))
342+
}
294343

295344
/// return the next token, or an error if there are none left
296345
fn next_token(&mut self) -> ArrowResult<Token> {
@@ -479,12 +528,9 @@ impl<'a> Tokenizer<'a> {
479528
"Some" => Token::Some,
480529
"None" => Token::None,
481530

482-
_ => {
483-
return Err(make_error(
484-
self.val,
485-
&format!("unrecognized word: {}", self.word),
486-
))
487-
}
531+
"Struct" => Token::Struct,
532+
// If we don't recognize the word, treat it as a field name
533+
word => Token::FieldName(word.to_string()),
488534
};
489535
Ok(token)
490536
}
@@ -546,6 +592,8 @@ enum Token {
546592
List,
547593
LargeList,
548594
FixedSizeList,
595+
Struct,
596+
FieldName(String),
549597
}
550598

551599
impl Display for Token {
@@ -573,6 +621,8 @@ impl Display for Token {
573621
Token::Dictionary => write!(f, "Dictionary"),
574622
Token::Integer(v) => write!(f, "Integer({v})"),
575623
Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
624+
Token::Struct => write!(f, "Struct"),
625+
Token::FieldName(s) => write!(f, "FieldName({s})"),
576626
}
577627
}
578628
}
@@ -680,7 +730,37 @@ mod test {
680730
DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
681731
),
682732
),
683-
// TODO support more structured types (List, LargeList, Struct, Union, Map, RunEndEncoded, etc)
733+
DataType::Struct(Fields::from(vec![
734+
Field::new("f1", DataType::Int64, true),
735+
Field::new("f2", DataType::Float64, true),
736+
Field::new(
737+
"f3",
738+
DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
739+
true,
740+
),
741+
Field::new(
742+
"f4",
743+
DataType::Dictionary(
744+
Box::new(DataType::Int8),
745+
Box::new(DataType::FixedSizeBinary(23)),
746+
),
747+
true,
748+
),
749+
])),
750+
DataType::Struct(Fields::from(vec![
751+
Field::new("Int64", DataType::Int64, true),
752+
Field::new("Float64", DataType::Float64, true),
753+
])),
754+
DataType::Struct(Fields::from(vec![
755+
Field::new("f1", DataType::Int64, true),
756+
Field::new(
757+
"nested_struct",
758+
DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
759+
true,
760+
),
761+
])),
762+
DataType::Struct(Fields::empty()),
763+
// TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc)
684764
]
685765
}
686766

@@ -754,11 +834,13 @@ mod test {
754834
("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
755835
("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
756836
("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
757-
837+
("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
838+
("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
839+
("Struct(f1)", "Error finding next type, got unexpected ')'"),
758840
];
759841

760842
for (data_type_string, expected_message) in cases {
761-
print!("Parsing '{data_type_string}', expecting '{expected_message}'");
843+
println!("Parsing '{data_type_string}', expecting '{expected_message}'");
762844
match parse_data_type(data_type_string) {
763845
Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
764846
Err(e) => {

0 commit comments

Comments
 (0)