Skip to content

Commit e823163

Browse files
authored
doc: Document local HDFS setup (#1673)
* doc: Document local HDFS setup * doc: Document local HDFS setup
1 parent cfc4cbb commit e823163

File tree

3 files changed

+167
-0
lines changed

3 files changed

+167
-0
lines changed

docs/source/user-guide/datasources.md

+36
Original file line numberDiff line numberDiff line change
@@ -111,5 +111,41 @@ Verify the native scan type should be `CometNativeScan`.
111111

112112
More on [HDFS Reader](../../../native/hdfs/README.md)
113113

114+
### Local HDFS development
115+
116+
- Configure the local machine network: add the cluster hostnames to `/etc/hosts`
117+
```commandline
118+
127.0.0.1 localhost namenode datanode1 datanode2 datanode3
119+
::1 localhost namenode datanode1 datanode2 datanode3
120+
```
121+
122+
- Start the local HDFS cluster (3 datanodes); the namenode URL is `hdfs://namenode:9000`
123+
```commandline
124+
docker compose -f kube/local/hdfs-docker-compose.yml up
125+
```
126+
127+
- Check the local namenode is up and running on `http://localhost:9870/dfshealth.html#tab-overview`
128+
- Build the project with HDFS support (adjust the `JAVA_HOME` paths to your installation)
129+
```commandline
130+
JAVA_HOME="/opt/homebrew/opt/openjdk@11" make release PROFILES="-Pspark-3.5" COMET_FEATURES=hdfs RUSTFLAGS="-L /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home/lib/server"
131+
```
132+
133+
- Run local test
134+
```scala
135+
136+
withSQLConf(
137+
CometConf.COMET_ENABLED.key -> "true",
138+
CometConf.COMET_EXEC_ENABLED.key -> "true",
139+
CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION,
140+
SQLConf.USE_V1_SOURCE_LIST.key -> "parquet",
141+
"fs.defaultFS" -> "hdfs://namenode:9000",
142+
"dfs.client.use.datanode.hostname" -> "true") {
143+
val df = spark.read.parquet("/tmp/2")
144+
df.show(false)
145+
df.explain("extended")
146+
}
```
149+
Or use `spark-shell` with HDFS support as described [above](#using-experimental-native-datafusion-reader)
114150
## S3
115151
In progress

kube/local/hadoop.env

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
CORE_CONF_fs_defaultFS=hdfs://namenode:9000
19+
CORE_CONF_hadoop_http_staticuser_user=root
20+
CORE_CONF_hadoop_proxyuser_hue_hosts=*
21+
CORE_CONF_hadoop_proxyuser_hue_groups=*
22+
CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec
23+
CORE_CONF_hadoop_tmp_dir=/hadoop-data
24+
CORE_CONF_dfs_client_use_datanode_hostname=true
25+
CORE_CONF_dfs_datanode_use_datanode_hostname=true
26+
27+
HDFS_CONF_dfs_webhdfs_enabled=true
28+
HDFS_CONF_dfs_permissions_enabled=false
29+
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
30+
HDFS_CONF_dfs_client_use_datanode_hostname=true
31+
HDFS_CONF_dfs_datanode_use_datanode_hostname=true

kube/local/hdfs-docker-compose.yml

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
version: "3"
19+
20+
services:
21+
namenode:
22+
image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
23+
container_name: namenode
24+
restart: always
25+
ports:
26+
- 9870:9870
27+
- 9000:9000
28+
volumes:
29+
- /tmp/hadoop/dfs/name:/hadoop/dfs/name
30+
environment:
31+
- CLUSTER_NAME=test
32+
env_file:
33+
- hadoop.env
34+
35+
datanode1:
36+
image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
37+
container_name: datanode1
38+
hostname: datanode1
39+
restart: always
40+
ports:
41+
- 9866:9866
42+
- 9864:9864
43+
depends_on:
44+
- namenode
45+
volumes:
46+
- /tmp/hadoop/dfs/data1:/hadoop/dfs/data
47+
environment:
48+
SERVICE_PRECONDITION: "namenode:9870"
49+
env_file:
50+
- hadoop.env
51+
datanode2:
52+
image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
53+
container_name: datanode2
54+
hostname: datanode2
55+
restart: always
56+
ports:
57+
- 9867:9866
58+
- 9865:9864
59+
depends_on:
60+
- namenode
61+
volumes:
62+
- /tmp/hadoop/dfs/data2:/hadoop/dfs/data
63+
environment:
64+
SERVICE_PRECONDITION: "namenode:9870"
65+
env_file:
66+
- hadoop.env
67+
datanode3:
68+
image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
69+
container_name: datanode3
70+
hostname: datanode3
71+
restart: always
72+
ports:
73+
- 9868:9866
74+
- 9863:9864
75+
depends_on:
76+
- namenode
77+
volumes:
78+
- /tmp/hadoop/dfs/data3:/hadoop/dfs/data
79+
environment:
80+
SERVICE_PRECONDITION: "namenode:9870"
81+
env_file:
82+
- hadoop.env
83+
84+
resourcemanager:
85+
image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
86+
container_name: resourcemanager
87+
restart: always
88+
environment:
89+
SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 datanode3:9864 datanode1:9866 datanode2:9866 datanode3:9866"
90+
env_file:
91+
- hadoop.env
92+
93+
nodemanager1:
94+
image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
95+
container_name: nodemanager
96+
restart: always
97+
environment:
98+
SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 datanode3:9864 datanode1:9866 datanode2:9866 datanode3:9866 resourcemanager:8088"
99+
env_file:
100+
- hadoop.env

0 commit comments

Comments
 (0)