Comment on page
Iceberg
A quick guide to getting started on PuppyGraph with Apache Iceberg
The guide requires Docker. Please ensure that
docker
and docker compose
CLIs are available. See https://www.docker.com/get-started/ for more details. Docker Compose v2 is required. The version can be checked by running:
docker compose version
Create a file
▶
docker-compose.yaml
with the following content.
# reference: https://iceberg.apache.org/spark-quickstart/
# Docker Compose v2 stack: Spark + Iceberg REST catalog + MinIO + PuppyGraph.
# The top-level `version` key is intentionally omitted: Compose v2 ignores it
# and recent releases print an "obsolete" warning when it is present.
services:
  # Spark with the Iceberg runtime; used to create and populate the tables.
  spark-iceberg:
    image: tabulario/spark-iceberg
    container_name: spark-iceberg
    # NOTE(review): expects a ./spark build context next to this file;
    # presumably only needed when the image cannot be pulled - confirm.
    build: spark/
    networks:
      iceberg_net:
    depends_on:
      - rest
      - minio
    volumes:
      - ./warehouse:/home/iceberg/warehouse
      - ./notebooks:/home/iceberg/notebooks/notebooks
    environment:
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=us-east-1
    ports:
      - "8888:8888"
      # Container port 8080 is published on host port 8180; host port 8080
      # is taken by the puppygraph service below.
      - "8180:8080"
      - "10000:10000"
      - "10001:10001"

  # Iceberg REST catalog, storing table data in the MinIO `warehouse` bucket.
  rest:
    image: tabulario/iceberg-rest
    container_name: iceberg-rest
    networks:
      iceberg_net:
    ports:
      - "8181:8181"
    environment:
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=us-east-1
      - CATALOG_WAREHOUSE=s3://warehouse/
      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
      - CATALOG_S3_ENDPOINT=http://minio:9000

  # S3-compatible object store (API on 9000, web console on 9001).
  minio:
    image: minio/minio
    container_name: minio
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=password
      - MINIO_DOMAIN=minio
    networks:
      iceberg_net:
        # Extra hostname so that `warehouse.minio` resolves to this container.
        aliases:
          - warehouse.minio
    ports:
      - "9001:9001"
      - "9000:9000"
    command: ["server", "/data", "--console-address", ":9001"]

  # MinIO client: waits until MinIO answers, recreates the `warehouse`
  # bucket, opens it for anonymous access, then idles so the container
  # stays up.
  # `mc alias set` / `mc anonymous set` replace the removed
  # `mc config host add` / `mc policy set` sub-commands; with the old names
  # the `until` loop never succeeds on current minio/mc images.
  mc:
    depends_on:
      - minio
    image: minio/mc
    container_name: mc
    networks:
      iceberg_net:
    environment:
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=us-east-1
    entrypoint: >
      /bin/sh -c "
      until (/usr/bin/mc alias set minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
      /usr/bin/mc rm -r --force minio/warehouse;
      /usr/bin/mc mb minio/warehouse;
      /usr/bin/mc anonymous set public minio/warehouse;
      tail -f /dev/null
      "

  # PuppyGraph server; the credentials below are the ones used when
  # uploading the schema with curl.
  puppygraph:
    image: puppygraph/puppygraph:stable
    container_name: puppygraph
    networks:
      iceberg_net:
    environment:
      - PUPPYGRAPH_USERNAME=puppygraph
      - PUPPYGRAPH_PASSWORD=888888
    ports:
      - "8080:8080"
      - "8081:8081"
      - "8182:8182"
    depends_on:
      - spark-iceberg

# Single shared network; the explicit name is what `docker compose up`
# reports as "Network puppy-iceberg".
networks:
  iceberg_net:
    name: puppy-iceberg
Then run the following command to start the Iceberg and PuppyGraph services:
▶
docker compose up -d
[+] Running 6/6
✔ Network puppy-iceberg Created
✔ Container minio Started
✔ Container mc Started
✔ Container iceberg-rest Started
✔ Container spark-iceberg Started
✔ Container puppygraph Started
We first prepare some data on Iceberg.
Run the following command to start a Spark-SQL shell to access Iceberg.
▶
docker exec -it spark-iceberg spark-sql
The shell prompt looks like this:
spark-sql ()>
Then execute the following SQL statements in the shell to create tables and insert data.
▶
For your first graph, use the "Modern" graph, which looks like this:

Modern Graph
-- Namespace that holds every table of the Modern graph.
CREATE DATABASE demo.modern;

-- Vertex table: people.
CREATE EXTERNAL TABLE demo.modern.v_person (
    id   STRING,
    name STRING,
    age  INT
) USING iceberg;

INSERT INTO demo.modern.v_person VALUES
    ('v1', 'marko', 29),
    ('v2', 'vadas', 27),
    ('v4', 'josh', 32),
    ('v6', 'peter', 35);

-- Vertex table: software projects.
CREATE EXTERNAL TABLE demo.modern.v_software (
    id   STRING,
    name STRING,
    lang STRING
) USING iceberg;

INSERT INTO demo.modern.v_software VALUES
    ('v3', 'lop', 'java'),
    ('v5', 'ripple', 'java');

-- Edge table: person -> software, keyed by from_id / to_id.
CREATE EXTERNAL TABLE demo.modern.e_created (
    id      STRING,
    from_id STRING,
    to_id   STRING,
    weight  DOUBLE
) USING iceberg;

INSERT INTO demo.modern.e_created VALUES
    ('e9', 'v1', 'v3', 0.4),
    ('e10', 'v4', 'v5', 1.0),
    ('e11', 'v4', 'v3', 0.4),
    ('e12', 'v6', 'v3', 0.2);

-- Edge table: person -> person, keyed by from_id / to_id.
CREATE EXTERNAL TABLE demo.modern.e_knows (
    id      STRING,
    from_id STRING,
    to_id   STRING,
    weight  DOUBLE
) USING iceberg;

INSERT INTO demo.modern.e_knows VALUES
    ('e7', 'v1', 'v2', 0.5),
    ('e8', 'v1', 'v4', 1.0);
The above SQL creates the following tables:
v_person
v_software
e_knows
e_created
id | name | age |
---|---|---|
v1 | marko | 29 |
v2 | vadas | 27 |
v4 | josh | 32 |
v6 | peter | 35 |
id | name | lang |
---|---|---|
v3 | lop | java |
v5 | ripple | java |
id | from_id | to_id | weight |
---|---|---|---|
e7 | v1 | v2 | 0.5 |
e8 | v1 | v4 | 1.0 |
id | from_id | to_id | weight |
---|---|---|---|
e9 | v1 | v3 | 0.4 |
e10 | v4 | v5 | 1.0 |
e11 | v4 | v3 | 0.4 |
e12 | v6 | v3 | 0.2 |
We then define a graph on top of the Iceberg tables we just created.
Create a PuppyGraph schema file
▶
iceberg.json
with the following content:
{
"catalogs": [
{
"name": "iceberg_test",
"type": "iceberg",
"metastore": {
"type": "rest",
"uri": "http://iceberg-rest:8181"
},
"storage": {
"useInstanceProfile": "false",
"accessKey": "admin",
"secretKey": "password",
"enableSsl": "false",
"endpoint": "http://minio:9000",
"enablePathStyleAccess": "true"
}
}
],
"vertices": [
{
"label": "person",
"mappedTableSource": {
"catalog": "iceberg_test",
"schema": "modern",
"table": "v_person",
"metaFields": {
"id": "id"
}
},
"attributes": [
{
"type": "Int",
"name": "age"
},
{
"type": "String",
"name": "name"
}
]
},
{
"label": "software",
"mappedTableSource": {
"catalog": "iceberg_test",
"schema": "modern",
"table": "v_software",
"metaFields": {
"id": "id"
}
},
"attributes": [
{
"type": "String",
"name": "lang"
},
{
"type": "String",
"name": "name"
}
]
}
],
"edges": [
{
"label": "knows",
"mappedTableSource": {
"catalog": "iceberg_test",
"schema": "modern",
"table": "e_knows",
"metaFields": {
"id": "id",
"from": "from_id",
"to": "to_id"
}
},
"from": "person",
"to": "person",
"attributes": [
{
"type": "Double",
"name": "weight"
}
]
},
{
"label": "created",
"mappedTableSource": {
"catalog": "iceberg_test",
"schema": "modern",
"table": "e_created",
"metaFields": {
"id": "id",
"from": "from_id",
"to": "to_id"
}
},
"from": "person",
"to": "software",
"attributes": [
{
"type": "Double",
"name": "weight"
}
]
}
]
}
Run the following command to upload the schema file:
▶
curl -XPOST -H "content-type: application/json" --data-binary @./iceberg.json --user "puppygraph:888888" localhost:8081/schema
The response shows that the graph schema has been uploaded successfully:
{"Status":"OK","Message":"Schema uploaded and gremlin server restarted"}
Now we can query the graph through the Gremlin console provided by the PuppyGraph CLI.
Access the PuppyGraph CLI by running the following command:
▶
docker exec -it puppygraph ./bin/puppygraph
In the PuppyGraph CLI, type in
▶
console
to start a Gremlin console.
 ____ ____ _
| _ \ _ _ _ __ _ __ _ _ / ___| _ __ __ _ _ __ | |__
| |_) | | | | | | '_ \ | '_ \ | | | | | | _ | '__| / _` | | '_ \ | '_ \
| __/ | |_| | | |_) | | |_) | | |_| | | |_| | | | | (_| | | |_) | | | | |
|_| \__,_| | .__/ | .__/ \__, | \____| |_| \__,_| | .__/ |_| |_|
|_| |_| |___/ |_|
Welcome to PuppyGraph, type help to see the command list
[PuppyGraph]> console
Try out the following queries in the console.
▶
g.V()
g.E()
g.V().count()
g.E().count()
g.V().outE().otherV().path()
g.V().elementMap()
g.E().elementMap()
The results are like the following:
gremlin> g.V()
==>v[software:::v5]
==>v[software:::v3]
==>v[person:::v4]
==>v[person:::v6]
==>v[person:::v1]
==>v[person:::v2]
gremlin> g.E()
==>e[created:::e10][person:::v4-created->software:::v5]
==>e[created:::e11][person:::v4-created->software:::v3]
==>e[created:::e12][person:::v6-created->software:::v3]
==>e[created:::e9][person:::v1-created->software:::v3]
==>e[knows:::e7][person:::v1-knows->person:::v2]
==>e[knows:::e8][person:::v1-knows->person:::v4]
gremlin> g.V().count()
==>6
gremlin> g.E().count()
==>6
gremlin> g.V().outE().otherV().path()
==>path[v[person:::v4], e[created:::e10][person:::v4-created->software:::v5], v[software:::v5]]
==>path[v[person:::v6], e[created:::e12][person:::v6-created->software:::v3], v[software:::v3]]
==>path[v[person:::v1], e[created:::e9][person:::v1-created->software:::v3], v[software:::v3]]
==>path[v[person:::v4], e[created:::e11][person:::v4-created->software:::v3], v[software:::v3]]
==>path[v[person:::v1], e[knows:::e7][person:::v1-knows->person:::v2], v[person:::v2]]
==>path[v[person:::v1], e[knows:::e8][person:::v1-knows->person:::v4], v[person:::v4]]
gremlin> g.V().elementMap()
==>{id=software:::v3, label=software, name=lop, lang=java}
==>{id=software:::v5, label=software, name=ripple, lang=java}
==>{id=person:::v4, label=person, name=josh, age=32}
==>{id=person:::v6, label=person, name=peter, age=35}
==>{id=person:::v1, label=person, name=marko, age=29}
==>{id=person:::v2, label=person, name=vadas, age=27}
gremlin> g.E().elementMap()
==>{id=created:::e10, label=created, IN={id=software:::v5, label=software}, OUT={id=person:::v4, label=person}, weight=1.0}
==>{id=created:::e11, label=created, IN={id=software:::v3, label=software}, OUT={id=person:::v4, label=person}, weight=0.4}
==>{id=created:::e12, label=created, IN={id=software:::v3, label=software}, OUT={id=person:::v6, label=person}, weight=0.2}
==>{id=created:::e9, label=created, IN={id=software:::v3, label=software}, OUT={id=person:::v1, label=person}, weight=0.4}
==>{id=knows:::e7, label=knows, IN={id=person:::v2, label=person}, OUT={id=person:::v1, label=person}, weight=0.5}
==>{id=knows:::e8, label=knows, IN={id=person:::v4, label=person}, OUT={id=person:::v1, label=person}, weight=1.0}
gremlin>
To exit the Gremlin Console, type the following and press Enter:
▶
:exit
To exit the PuppyGraph CLI, type the following and press Enter:
▶
exit
Run the following command to shut down and remove the services:
▶
docker compose down
Please refer to Connecting to Apache Iceberg for connecting to different implementations of Iceberg.
Last modified 3d ago