|
17 | 17 |
|
18 | 18 | use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
|
19 | 19 |
|
20 |
| -use crate::{ArrowError, DataType, Field, IntervalUnit, TimeUnit}; |
| 20 | +use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit}; |
21 | 21 |
|
22 | 22 | pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
|
23 | 23 | Parser::new(val).parse()
|
@@ -78,6 +78,11 @@ impl<'a> Parser<'a> {
|
78 | 78 | Token::List => self.parse_list(),
|
79 | 79 | Token::LargeList => self.parse_large_list(),
|
80 | 80 | Token::FixedSizeList => self.parse_fixed_size_list(),
|
| 81 | + Token::Struct => self.parse_struct(), |
| 82 | + Token::FieldName(word) => Err(make_error( |
| 83 | + self.val, |
| 84 | + &format!("unrecognized word: {}", word), |
| 85 | + )), |
81 | 86 | tok => Err(make_error(
|
82 | 87 | self.val,
|
83 | 88 | &format!("finding next type, got unexpected '{tok}'"),
|
@@ -150,6 +155,10 @@ impl<'a> Parser<'a> {
|
150 | 155 | fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
|
151 | 156 | match self.next_token()? {
|
152 | 157 | Token::DoubleQuotedString(s) => Ok(s),
|
| 158 | + Token::FieldName(word) => Err(make_error( |
| 159 | + self.val, |
| 160 | + &format!("unrecognized word: {}", word), |
| 161 | + )), |
153 | 162 | tok => Err(make_error(
|
154 | 163 | self.val,
|
155 | 164 | &format!("finding double quoted string for {context}, got '{tok}'"),
|
@@ -291,6 +300,46 @@ impl<'a> Parser<'a> {
|
291 | 300 | Box::new(value_type),
|
292 | 301 | ))
|
293 | 302 | }
|
| 303 | + fn parse_struct(&mut self) -> ArrowResult<DataType> { |
| 304 | + self.expect_token(Token::LParen)?; |
| 305 | + let mut fields = Vec::new(); |
| 306 | + loop { |
| 307 | + let field_name = match self.next_token()? { |
| 308 | + // It's valid to have a name that is a type name |
| 309 | + Token::SimpleType(data_type) => data_type.to_string(), |
| 310 | + Token::FieldName(name) => name, |
| 311 | + Token::RParen => { |
| 312 | + if fields.is_empty() { |
| 313 | + break; |
| 314 | + } else { |
| 315 | + return Err(make_error( |
| 316 | + self.val, |
| 317 | + "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma", |
| 318 | + )); |
| 319 | + } |
| 320 | + } |
| 321 | + tok => { |
| 322 | + return Err(make_error( |
| 323 | + self.val, |
| 324 | + &format!("Expected a word for the name of Struct, but got {tok}"), |
| 325 | + )) |
| 326 | + } |
| 327 | + }; |
| 328 | + let field_type = self.parse_next_type()?; |
| 329 | + fields.push(Arc::new(Field::new(field_name, field_type, true))); |
| 330 | + match self.next_token()? { |
| 331 | + Token::Comma => continue, |
| 332 | + Token::RParen => break, |
| 333 | + tok => { |
| 334 | + return Err(make_error( |
| 335 | + self.val, |
| 336 | + &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"), |
| 337 | + )) |
| 338 | + } |
| 339 | + } |
| 340 | + } |
| 341 | + Ok(DataType::Struct(Fields::from(fields))) |
| 342 | + } |
294 | 343 |
|
295 | 344 | /// return the next token, or an error if there are none left
|
296 | 345 | fn next_token(&mut self) -> ArrowResult<Token> {
|
@@ -479,12 +528,9 @@ impl<'a> Tokenizer<'a> {
|
479 | 528 | "Some" => Token::Some,
|
480 | 529 | "None" => Token::None,
|
481 | 530 |
|
482 |
| - _ => { |
483 |
| - return Err(make_error( |
484 |
| - self.val, |
485 |
| - &format!("unrecognized word: {}", self.word), |
486 |
| - )) |
487 |
| - } |
| 531 | + "Struct" => Token::Struct, |
| 532 | + // If we don't recognize the word, treat it as a field name |
| 533 | + word => Token::FieldName(word.to_string()), |
488 | 534 | };
|
489 | 535 | Ok(token)
|
490 | 536 | }
|
@@ -546,6 +592,8 @@ enum Token {
|
546 | 592 | List,
|
547 | 593 | LargeList,
|
548 | 594 | FixedSizeList,
|
| 595 | + Struct, |
| 596 | + FieldName(String), |
549 | 597 | }
|
550 | 598 |
|
551 | 599 | impl Display for Token {
|
@@ -573,6 +621,8 @@ impl Display for Token {
|
573 | 621 | Token::Dictionary => write!(f, "Dictionary"),
|
574 | 622 | Token::Integer(v) => write!(f, "Integer({v})"),
|
575 | 623 | Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
|
| 624 | + Token::Struct => write!(f, "Struct"), |
| 625 | + Token::FieldName(s) => write!(f, "FieldName({s})"), |
576 | 626 | }
|
577 | 627 | }
|
578 | 628 | }
|
@@ -680,7 +730,37 @@ mod test {
|
680 | 730 | DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
|
681 | 731 | ),
|
682 | 732 | ),
|
683 |
| - // TODO support more structured types (List, LargeList, Struct, Union, Map, RunEndEncoded, etc) |
| 733 | + DataType::Struct(Fields::from(vec![ |
| 734 | + Field::new("f1", DataType::Int64, true), |
| 735 | + Field::new("f2", DataType::Float64, true), |
| 736 | + Field::new( |
| 737 | + "f3", |
| 738 | + DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())), |
| 739 | + true, |
| 740 | + ), |
| 741 | + Field::new( |
| 742 | + "f4", |
| 743 | + DataType::Dictionary( |
| 744 | + Box::new(DataType::Int8), |
| 745 | + Box::new(DataType::FixedSizeBinary(23)), |
| 746 | + ), |
| 747 | + true, |
| 748 | + ), |
| 749 | + ])), |
| 750 | + DataType::Struct(Fields::from(vec![ |
| 751 | + Field::new("Int64", DataType::Int64, true), |
| 752 | + Field::new("Float64", DataType::Float64, true), |
| 753 | + ])), |
| 754 | + DataType::Struct(Fields::from(vec![ |
| 755 | + Field::new("f1", DataType::Int64, true), |
| 756 | + Field::new( |
| 757 | + "nested_struct", |
| 758 | + DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])), |
| 759 | + true, |
| 760 | + ), |
| 761 | + ])), |
| 762 | + DataType::Struct(Fields::empty()), |
| 763 | + // TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc) |
684 | 764 | ]
|
685 | 765 | }
|
686 | 766 |
|
@@ -754,11 +834,13 @@ mod test {
|
754 | 834 | ("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
|
755 | 835 | ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
|
756 | 836 | ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
|
757 |
| - |
| 837 | + ("Struct(f1, Int64)", "Error finding next type, got unexpected ','"), |
| 838 | + ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"), |
| 839 | + ("Struct(f1)", "Error finding next type, got unexpected ')'"), |
758 | 840 | ];
|
759 | 841 |
|
760 | 842 | for (data_type_string, expected_message) in cases {
|
761 |
| - print!("Parsing '{data_type_string}', expecting '{expected_message}'"); |
| 843 | + println!("Parsing '{data_type_string}', expecting '{expected_message}'"); |
762 | 844 | match parse_data_type(data_type_string) {
|
763 | 845 | Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
|
764 | 846 | Err(e) => {
|
|
0 commit comments