tidy up parquet from lib

This commit is contained in:
2021-02-05 12:30:01 +01:00
parent b869c1df3f
commit 09011b50ba
19 changed files with 0 additions and 2909 deletions

View File

@@ -0,0 +1,133 @@
#-----------------------------------------------------------------------------#
import argparse
import os
#-----------------------------------------------------------------------------#
# Command line interface: the source file to analyse and the file that
# receives the resulting list of dependencies.
parser = argparse.ArgumentParser(
    description='List all source dependencies',
)
#parser.add_argument('pathToRepo',type=str,help='path of source repository')
parser.add_argument(
    'mainSource',
    type=str,
    help='main source file',
)
parser.add_argument(
    'depFile',
    type=str,
    help='file listing all dependencies',
)
args = parser.parse_args()

# Include directories passed to g++ (one "-I" each) when the dependencies
# of a source file are generated.
libpaths = [
    "/home/mario/Desktop/arrow/cpp/src/",
    "/home/mario/Desktop/arrow/cpp/thrift_ep-install/include/",
    "/home/mario/Desktop/arrow/cpp/boost_ep-prefix/src/boost_ep/",
]
#-----------------------------------------------------------------------------#
def find_dependencies(srcfile, recdepth, cdeplist):
    """
    Recursively collect the files a C++ source file depends on.

    The source is compiled with ``g++ -c -MMD`` (include directories are taken
    from the module-level ``libpaths``). The dependency file written by the
    compiler is parsed and then removed again together with the object file.
    For every reported dependency a ``*.cc`` file with the same stem is looked
    up and, if it exists and is not known yet, analysed the same way.

    Paths containing whitespace are not supported.

    Args:
        srcfile (string): path/name of source file
        recdepth (integer): current recursion depth
        cdeplist (list): current list of dependencies
    Return:
        deps (list): newly found dependencies, i.e. files the source file
            depends on that are not already contained in cdeplist
    Raises:
        FileNotFoundError: if no dependency file could be produced
    """
    # imported locally so the function does not rely on a module-level import
    import subprocess

    # recursion is cut off at this depth as a safety net
    max_depth = 20

    # define indentation to visualize the recursion
    indent = recdepth * (" ")
    print("\n" + indent + "find_dependencies:"
          + "\n" + indent + "1: " + srcfile
          + "\n" + indent + "2: " + str(recdepth)
          + "\n" + indent + "3: " + str(len(cdeplist)) + "\n")

    # generate dependencies by means of g++; the command is passed as an
    # argument list so that no shell is involved
    cmd = ["g++", "-c", "-MMD", srcfile]
    for libpath in libpaths:
        cmd += ["-I", libpath]
    print(indent + " ".join(cmd))
    # the exit status is ignored on purpose: a failed compilation shows up
    # below as a missing dependency file
    subprocess.run(cmd, check=False)

    # g++ names its outputs after the source basename with only the LAST
    # suffix replaced ("a.b.cc" -> "a.b.d" / "a.b.o")
    basename = os.path.splitext(os.path.basename(srcfile))[0]
    depfile = basename + '.d'
    print(indent + "reading dependency file " + depfile)
    with open(depfile, 'r') as fin:
        rule = fin.read()

    # delete dependency and object file; the object file is missing when the
    # compilation failed, which is not an error here
    for leftover in (depfile, basename + '.o'):
        try:
            os.remove(leftover)
        except FileNotFoundError:
            pass

    # The file holds one make rule "target: prereq prereq \\\n prereq ...".
    # Join the continuation lines, drop the target and split on whitespace,
    # because g++ may put several paths on one line.  The source file itself
    # is not reported as its own dependency.
    prerequisites = rule.replace('\\\n', ' ').partition(':')[2]
    depslistcl = [dep for dep in prerequisites.split() if dep != srcfile]

    # collect dependencies that are not known yet
    newdeps = []
    for dep in depslistcl:
        if dep not in cdeplist:
            print(indent + "adding dependency " + dep)
            newdeps.append(dep)
    print(indent + "=> added " + str(len(newdeps)) + "/" + str(len(depslistcl)))

    # check all dependencies of every single dependency
    if recdepth < max_depth:
        for dep in depslistcl:
            # try to find corresponding *.cc file (only the last suffix is
            # replaced, so dots elsewhere in the path are harmless)
            depcc = os.path.splitext(dep)[0] + '.cc'
            print(indent + "checking for " + depcc)
            if not os.path.exists(depcc):
                print(indent + "does not exist")
                continue
            if depcc in cdeplist or depcc in newdeps:
                print(indent + "already in list")
                continue
            # add file itself as dependency
            newdeps.append(depcc)
            # find dependencies of this source and merge them
            for el in find_dependencies(depcc, recdepth + 1, cdeplist + newdeps):
                if el not in newdeps:
                    newdeps.append(el)

    print("\n")
    # provide list of dependencies
    return newdeps
#-----------------------------------------------------------------------------#
if __name__ == "__main__":
    # Script entry point: resolve all dependencies of the given main source
    # and write them, one per line, to the requested output file.
    print("\nCLI arguments:\n" + str(args) + "\n")

    # start recursion with given source file
    deps = find_dependencies(args.mainSource, 0, [])
    print("\nfinal list of dependencies: (" + str(len(deps)) + ")\n")
    print(deps)
    print("\n")

    # remove any duplicates
    depsuni = set(deps)
    print("\nfinal set of dependencies: (" + str(len(depsuni)) + ")\n")
    print(depsuni)
    print("\n")

    # write list of dependencies; sorted, because the iteration order of a
    # set of strings differs between runs and would make the file unstable
    with open(args.depFile, 'w') as fout:
        for el in sorted(depsuni):
            fout.write(str(el) + '\n')
#-----------------------------------------------------------------------------#

View File

@@ -0,0 +1,23 @@
#-----------------------------------------------------------------------------#
import glob
from pathlib import Path
# find source files (sorted, so the generated rules come out in a stable order)
srcpaths = Path("src/").rglob('*.cc')
deps = sorted(str(path) for path in srcpaths)
print(deps)

# Every object file is written to bin/ without its directory part, so two
# sources sharing a file name yield two rules for the same target.  Remember
# which source produced which object to be able to point that out.
objsources = {}

# Write one make rule per source file: "bin/<stem>.o : <source>" followed by
# the compile recipe and an empty line.
with open('makefileobj', 'w') as fout:
    for el in deps:
        srcpath = Path(el)
        basnam = srcpath.name
        print(el + " : " + basnam + " : " + srcpath.suffix.lstrip('.'))
        # test the real suffix instead of the text after the first dot,
        # which is wrong for names such as "foo.bar.cc"
        if srcpath.suffix != '.cc':
            continue
        # replace only the trailing suffix, not every ".cc" in the name
        objfile = 'bin/' + srcpath.stem + '.o'
        if objfile in objsources:
            print("warning: " + objfile + " is already built from "
                  + objsources[objfile] + "; the rule for " + el
                  + " defines the same target")
        else:
            objsources[objfile] = el
        fout.write(objfile + " : " + el + "\n")
        fout.write("\t" + "$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@\n")
        fout.write("\n")
#-----------------------------------------------------------------------------#

356
parquet/parquet/makefile Normal file
View File

@@ -0,0 +1,356 @@
#-----------------------------------------------------------------------------#
# C++ compiler command including the language standard
CPP := g++ -std=c++14
# warning flags passed to every compile and link command
CPPFLAGS := -Woverflow -Wpedantic -Wextra -Waddress -Waligned-new -Walloc-zero
# root directory of the sources; SRC is re-assigned further down to the list
# of *.cc files found below this directory
SRC := src/
# directory prefix of all object files
BIN := bin/
# include search paths (despite the name these are -I flags, not libraries)
LIBS := -I src/src/ -I src/thrift_ep-install/include/ -I src/boost_ep-prefix/src/boost_ep/
#-----------------------------------------------------------------------------#
# prepare source
#
# before: $ cd arrow/cpp/ and compile relevant sources by
# $ cmake . -D ARROW_PARQUET=ON -D PARQUET_BUILD_EXAMPLES=ON -D ARROW_WITH_SNAPPY=ON
# $ cmake .. -D ARROW_PARQUET=ON ARROW_BUILD_EXAMPLES=ON
# Configure the Arrow build (with Parquet, Python and the listed compression
# codecs) in the current directory.  Declared phony so that a file or
# directory called "lib" cannot prevent the target from running.
.PHONY : lib
lib :
	cmake . -D ARROW_WITH_BROTLI=ON -D ARROW_WITH_BZ2=ON -D ARROW_WITH_LZ4=ON -D ARROW_WITH_SNAPPY=ON -D ARROW_WITH_ZLIB=ON -D ARROW_PARQUET=ON -D ARROW_PYTHON=ON
# cp-src : deps.log
# ./src_copy.sh
# List all source dependencies of the example.  The example source and the
# generator script are prerequisites so that the list is rebuilt whenever
# one of them changes instead of staying stale once the file exists.
deps.log : reader-writer.cc generate_deps.py
	python3 generate_deps.py reader-writer.cc $@
# From here on SRC is the list of all *.cc files below the source root
# (the previous value of SRC is the directory handed to find).
SRC := $(shell find $(SRC) -name '*.cc')
# OBJ := $(apprefix obj/, $(SRC:%.cc=%.o))
# One object per source, all placed directly in $(BIN).  Sources sharing a
# file name (e.g. util/memory.cc and io/memory.cc) map to the same object;
# $(sort) removes such duplicates so no object is handed to the linker twice.
OBJ := $(sort $(addprefix $(BIN),$(notdir $(SRC:%.cc=%.o))))
# Print the detected sources and the derived object files (debugging aid).
.PHONY : check
check :
	@echo $(SRC)
	@echo $(OBJ)
# vpath %.cc src/
# Link the example against all objects.  bin/utilmemory.o is listed
# separately because it has its own explicit rule and is not part of $(OBJ).
# $^ expands to all prerequisites with duplicates removed, so the link line
# always matches the prerequisite list.
reader-writer-example : reader-writer.cc $(OBJ) bin/utilmemory.o
	$(CPP) $(CPPFLAGS) $(LIBS) -o $@ $^
# $(OBJ) : $(SRC)
# $(CPP) $(OPT) -c $< -o $@ -I src/src/
#
# $(BIN)%.o : $(SRC)
# $(CPP) $(OPT) -c $< -I src/src/ -o $@
# Remove all object files, including the separately named bin/utilmemory.o
# which is not contained in $(OBJ).
.PHONY : clean-obj
clean-obj :
	rm -f $(OBJ) bin/utilmemory.o
# => do build with cmake like here
# https://arrow.apache.org/docs/developers/python.html#build-and-test
#-----------------------------------------------------------------------------#
# Compile rules for the object files.
#
# Every object in $(BIN) is built from the *.cc file of the same name.  The
# sources live in several directories below the source root, so those
# directories (taken from the source list in SRC) are put on the search path
# for *.cc files and a single pattern rule replaces the former one-rule-per-
# file list, which had to be regenerated whenever a source was added.
vpath %.cc $(sort $(dir $(SRC)))

$(BIN)%.o : %.cc
	@mkdir -p $(dir $@)
	$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@

# memory.cc exists twice (arrow/io and arrow/util).  Both need their own
# object, so these two keep explicit rules, which take precedence over the
# pattern rule above.
bin/memory.o : src/src/arrow/io/memory.cc
	@mkdir -p $(dir $@)
	$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/utilmemory.o : src/src/arrow/util/memory.cc
	@mkdir -p $(dir $@)
	$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@

View File

@@ -0,0 +1,96 @@
#-----------------------------------------------------------------------------#
# Locations of the parquet-cpp checkout and of the Arrow C++ sources.  The
# values are defaults only (?=), so they can be overridden from the
# environment or the command line without editing this file.
PARQUETDIR ?= /home/mario/Desktop/Record_Evolution/parquet-cpp
ARROWDIR ?= /home/mario/Desktop/Record_Evolution/arrow/cpp/src
# C++ compiler command including the language standard
CPP := g++ -std=c++14
# warning flags
OPT := -Wall -Woverflow -Wpedantic -Wextra -Waddress -Waligned-new -Walloc-zero
# None of the following targets creates a file named after itself; "test"
# and "clean" in particular must run even if such a file or directory exists.
.PHONY : prepare collect_parquet modify_parquet collect_arrow modify_arrow collect_test subst test clean

# collect and patch the parquet and arrow sources in the current directory
prepare : collect_parquet modify_parquet collect_arrow modify_arrow

# copy the parquet sources and the low-level reader/writer example
collect_parquet :
	cp -r $(PARQUETDIR)/src/parquet ./
	cp $(PARQUETDIR)/examples/low-level-api/reader_writer.h ./
	cp $(PARQUETDIR)/examples/low-level-api/reader-writer.cc ./

# create the version header from its template and rename identifiers in the
# copied parquet headers
modify_parquet :
	cp parquet/parquet_version.h.in parquet/parquet_version.h
	sed -i 's/ReadableFileInterface/ReadWriteFileInterface/g' parquet/util/memory.h
	sed -i 's/ReadableFileInterface/ReadWriteFileInterface/g' parquet/file_reader.h
	sed -i 's/arrow::Codec/arrow::util::Codec/g' parquet/util/memory.h
	sed -i 's/valid_bits_writer/valid_bits_offset/g' parquet/column_reader.h

# copy the arrow sources
collect_arrow :
	cp -r $(ARROWDIR)/arrow ./

# provide bit_util.h under the additional name bit-util.h
modify_arrow :
	cp arrow/util/bit_util.h arrow/util/bit-util.h

# copy only the example source
collect_test :
	cp $(PARQUETDIR)/examples/low-level-api/reader-writer.cc ./

# comment out the arrow includes in parquet/properties.h
subst :
	sed -i 's/#include \"arrow\//\/\/#include \"arrow/g' parquet/properties.h

# compile the example; $(CURDIR) is set by make itself, whereas $(PWD) comes
# from the environment and may be unset or point elsewhere (e.g. with make -C)
test :
	$(CPP) $(OPT) -I$(CURDIR) reader-writer.cc

# remove everything that was collected; -f keeps the target from failing
# half-way when some of the files are already gone
clean :
	rm -rf parquet/ arrow/
	rm -f reader-writer.cc reader_writer.h
#-----------------------------------------------------------------------------#
# choose shell
# shell used to run the recipes
SHELL:=/bin/bash
# base name (without suffix) of the example source
SRC = reader-writer
# specify path of cloned directory; default only (?=), so it can be
# overridden from outside.  Must be an absolute path, because it is appended
# directly to a directory name when the extracted sources are moved.
ARROWGIT ?= /home/mario/Desktop/Record_Evolution/arrow
# "filewriter" and "readwrite" do not create files of that name, so they are
# declared phony.
.PHONY : filewriter readwrite

# compile parquet/file_writer.cc to an object file
filewriter : parquet/file_writer.cc
	$(CPP) -c $(OPT) $<
# build executable (and generate dependency file)
readwrite : reader-writer.cc
	$(CPP) $(OPT) -MMD $< -I ./
# generate dependency file (the object file is written as a by-product)
$(SRC).d : $(SRC).cc
	$(CPP) -c -MMD $< -I ./ -I $(ARROWGIT)/cpp/src/
# extract source dependencies
# Copy all headers listed in the dependency file, and the *.cc files that
# belong to them, from the arrow checkout into the current directory.
.PHONY : extract-dep
extract-dep : $(SRC).d
	@# extract relevant dependencies: one entry per line, without the
	@# continuation backslashes, keeping only entries that contain a '/'
	cat $< | sed 's/ /\n/g' | awk 'NF' | grep -v '\\' | grep '\/' > deps.log
	@# headers.log: the same entries joined into a single line ('$$' is
	@# needed so that make passes "$!" on to sed)
	sed ':a;N;$$!ba;s/\n/ /g' deps.log > headers.log
	@# sources.log: candidate source file for every header; derived from the
	@# one-entry-per-line list so that the end-of-line anchor hits every entry
	sed 's/\.h$$/.cc/' deps.log > sources.log
	@# copy required sources
	mkdir -p temp/
	cp --parents `cat headers.log` temp/
	@# not every header has a source file: the copy is best effort, so its
	@# exit status must not abort the recipe (leading '-')
	-cp --parents `cat sources.log` temp/ 2>/dev/null
	mv temp$(ARROWGIT)/cpp/src/* ./
	rm -r temp
# remove the intermediate files written while extracting the dependencies
.PHONY : clean-dep
clean-dep :
	rm -f deps.log headers.log sources.log $(SRC).d
#-----------------------------------------------------------------------------#
# only use more recent and up to date repository arrow.git
# build arrow shared/static libraries
# Configure and build arrow (including parquet) inside arrow/cpp.
# Every recipe line runs in its own shell, so the directory change has to be
# chained with the commands that rely on it.  $(MAKE) is used for the
# recursive call so that make options are handed on to the sub-make.
# Hint: "cmake -LA" shows all options; a further option that was left
# commented out here is ARROW_ARMV8_ARCH=armv8-a.
.PHONY : build
build :
	cd arrow/cpp && cmake . -D ARROW_PARQUET=ON && $(MAKE)
# Build the low-level reader/writer example against the arrow and parquet
# libraries.  The directory change and the compiler call share one shell
# (joined with &&), otherwise g++ would run in the makefile's directory
# where the relative paths do not apply.
.PHONY : example
example :
	cd arrow/cpp/examples/parquet/low-level-api/ && \
	g++ reader-writer.cc -I. -I../../../src/ -L../../../../cpp/build/release/ -larrow -lparquet
# set environment variable LD_LIBRARY_PATH=../../../../cpp/build/release/ before launching executable
#------------------------------------------------------------------------------------#

303
parquet/parquet/makefileobj Normal file
View File

@@ -0,0 +1,303 @@
bin/type.o : src/src/arrow/type.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/result.o : src/src/arrow/result.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder.o : src/src/arrow/builder.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/tensor.o : src/src/arrow/tensor.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/table.o : src/src/arrow/table.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/extension_type.o : src/src/arrow/extension_type.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/device.o : src/src/arrow/device.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/memory_pool.o : src/src/arrow/memory_pool.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/datum.o : src/src/arrow/datum.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/record_batch.o : src/src/arrow/record_batch.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/compare.o : src/src/arrow/compare.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/visitor.o : src/src/arrow/visitor.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/chunked_array.o : src/src/arrow/chunked_array.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/status.o : src/src/arrow/status.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/pretty_print.o : src/src/arrow/pretty_print.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/sparse_tensor.o : src/src/arrow/sparse_tensor.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/buffer.o : src/src/arrow/buffer.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/scalar.o : src/src/arrow/scalar.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/string.o : src/src/arrow/util/string.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
# arrow/util/memory.cc and arrow/io/memory.cc share a file name.  The util
# object gets a name of its own so that the two rules do not define the same
# target (bin/memory.o stays reserved for arrow/io/memory.cc).
bin/utilmemory.o : src/src/arrow/util/memory.cc
	$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/future.o : src/src/arrow/util/future.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/iterator.o : src/src/arrow/util/iterator.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/compression.o : src/src/arrow/util/compression.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/utf8.o : src/src/arrow/util/utf8.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/time.o : src/src/arrow/util/time.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/cpu_info.o : src/src/arrow/util/cpu_info.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/thread_pool.o : src/src/arrow/util/thread_pool.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/bit_util.o : src/src/arrow/util/bit_util.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/logging.o : src/src/arrow/util/logging.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/basic_decimal.o : src/src/arrow/util/basic_decimal.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/decimal.o : src/src/arrow/util/decimal.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/bit_block_counter.o : src/src/arrow/util/bit_block_counter.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/key_value_metadata.o : src/src/arrow/util/key_value_metadata.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/int_util.o : src/src/arrow/util/int_util.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/io_util.o : src/src/arrow/util/io_util.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/bitmap_ops.o : src/src/arrow/util/bitmap_ops.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/bitmap_builders.o : src/src/arrow/util/bitmap_builders.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/bit_run_reader.o : src/src/arrow/util/bit_run_reader.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/value_parsing.o : src/src/arrow/util/value_parsing.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/string_builder.o : src/src/arrow/util/string_builder.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/formatting.o : src/src/arrow/util/formatting.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/array_primitive.o : src/src/arrow/array/array_primitive.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/array_dict.o : src/src/arrow/array/array_dict.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_binary.o : src/src/arrow/array/builder_binary.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_union.o : src/src/arrow/array/builder_union.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/concatenate.o : src/src/arrow/array/concatenate.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/array_nested.o : src/src/arrow/array/array_nested.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/array_decimal.o : src/src/arrow/array/array_decimal.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_primitive.o : src/src/arrow/array/builder_primitive.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/data.o : src/src/arrow/array/data.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/diff.o : src/src/arrow/array/diff.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_nested.o : src/src/arrow/array/builder_nested.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_decimal.o : src/src/arrow/array/builder_decimal.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_dict.o : src/src/arrow/array/builder_dict.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/array_binary.o : src/src/arrow/array/array_binary.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_adaptive.o : src/src/arrow/array/builder_adaptive.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/array_base.o : src/src/arrow/array/array_base.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/validate.o : src/src/arrow/array/validate.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/builder_base.o : src/src/arrow/array/builder_base.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/util.o : src/src/arrow/array/util.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/caching.o : src/src/arrow/io/caching.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/memory.o : src/src/arrow/io/memory.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/interfaces.o : src/src/arrow/io/interfaces.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/buffered.o : src/src/arrow/io/buffered.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/file.o : src/src/arrow/io/file.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/strtod.o : src/src/arrow/vendored/double-conversion/strtod.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/bignum.o : src/src/arrow/vendored/double-conversion/bignum.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/fixed-dtoa.o : src/src/arrow/vendored/double-conversion/fixed-dtoa.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/fast-dtoa.o : src/src/arrow/vendored/double-conversion/fast-dtoa.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/diy-fp.o : src/src/arrow/vendored/double-conversion/diy-fp.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/double-conversion.o : src/src/arrow/vendored/double-conversion/double-conversion.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/bignum-dtoa.o : src/src/arrow/vendored/double-conversion/bignum-dtoa.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/cached-powers.o : src/src/arrow/vendored/double-conversion/cached-powers.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/api_aggregate.o : src/src/arrow/compute/api_aggregate.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/exec.o : src/src/arrow/compute/exec.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/kernel.o : src/src/arrow/compute/kernel.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/registry.o : src/src/arrow/compute/registry.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/function.o : src/src/arrow/compute/function.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/cast.o : src/src/arrow/compute/cast.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/api_vector.o : src/src/arrow/compute/api_vector.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/api_scalar.o : src/src/arrow/compute/api_scalar.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/codegen_internal.o : src/src/arrow/compute/kernels/codegen_internal.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/column_scanner.o : src/src/parquet/column_scanner.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/statistics.o : src/src/parquet/statistics.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/internal_file_decryptor.o : src/src/parquet/internal_file_decryptor.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/column_writer.o : src/src/parquet/column_writer.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/encryption.o : src/src/parquet/encryption.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/file_reader.o : src/src/parquet/file_reader.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/properties.o : src/src/parquet/properties.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/encryption_internal.o : src/src/parquet/encryption_internal.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/internal_file_encryptor.o : src/src/parquet/internal_file_encryptor.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/types.o : src/src/parquet/types.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/encoding.o : src/src/parquet/encoding.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/metadata.o : src/src/parquet/metadata.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/printer.o : src/src/parquet/printer.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/level_conversion.o : src/src/parquet/level_conversion.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/deprecated_io.o : src/src/parquet/deprecated_io.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/file_writer.o : src/src/parquet/file_writer.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/schema.o : src/src/parquet/schema.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/platform.o : src/src/parquet/platform.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@
bin/column_reader.o : src/src/parquet/column_reader.cc
$(CPP) $(CPPFLAGS) -c $< $(LIBS) -o $@

View File

@@ -0,0 +1,413 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cassert>
#include <fstream>
#include <iostream>
#include <memory>
#include "reader_writer.h"
/*
* This example describes writing and reading Parquet Files in C++ and serves as a
* reference to the API.
* The file contains all the physical data types supported by Parquet.
* This example uses the RowGroupWriter API that supports writing RowGroups optimized for
*memory consumption
**/
/* Parquet is a structured columnar file format
* Parquet File = "Parquet data" + "Parquet Metadata"
* "Parquet data" is simply a vector of RowGroups. Each RowGroup is a batch of rows in a
* columnar layout
* "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their
* Columns
* "file schema" is a tree where each node is either a primitive type (leaf nodes) or a
* complex (nested) type (internal nodes)
* For specific details, please refer to the format specification here:
* https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
**/
// Number of rows per row group; used as the loop bound when the individual
// columns are written below.
constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
// Name of the file the example opens as its output stream.
const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet";
int main(int argc, char** argv) {
/**********************************************************************************
PARQUET WRITER EXAMPLE
**********************************************************************************/
// parquet::REQUIRED fields do not need definition and repetition level values
// parquet::OPTIONAL fields require only definition level values
// parquet::REPEATED fields require both definition and repetition level values
try {
// Create a local file output stream instance.
using FileClass = ::arrow::io::FileOutputStream;
std::shared_ptr<FileClass> out_file;
PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(PARQUET_FILENAME));
// Setup the parquet schema
std::shared_ptr<GroupNode> schema = SetupSchema();
// Add writer properties
parquet::WriterProperties::Builder builder;
builder.compression(parquet::Compression::UNCOMPRESSED);
std::shared_ptr<parquet::WriterProperties> props = builder.build();
// Create a ParquetFileWriter instance
std::shared_ptr<parquet::ParquetFileWriter> file_writer =
parquet::ParquetFileWriter::Open(out_file, schema, props);
// Append a RowGroup with a specific number of rows.
parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
// Write the Bool column
parquet::BoolWriter* bool_writer =
static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
bool value = ((i % 2) == 0) ? true : false;
bool_writer->WriteBatch(1, nullptr, nullptr, &value);
}
// Write the Int32 column
parquet::Int32Writer* int32_writer =
static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
int32_t value = i;
int32_writer->WriteBatch(1, nullptr, nullptr, &value);
}
// Write the Int64 column. Each row has repeats twice.
parquet::Int64Writer* int64_writer =
static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) {
int64_t value = i * 1000 * 1000;
value *= 1000 * 1000;
int16_t definition_level = 1;
int16_t repetition_level = 0;
if ((i % 2) == 0) {
repetition_level = 1; // start of a new record
}
int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value);
}
// Write the INT96 column.
parquet::Int96Writer* int96_writer =
static_cast<parquet::Int96Writer*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
parquet::Int96 value;
value.value[0] = i;
value.value[1] = i + 1;
value.value[2] = i + 2;
int96_writer->WriteBatch(1, nullptr, nullptr, &value);
}
// Write the Float column
parquet::FloatWriter* float_writer =
static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
float value = static_cast<float>(i) * 1.1f;
float_writer->WriteBatch(1, nullptr, nullptr, &value);
}
// Write the Double column
parquet::DoubleWriter* double_writer =
static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
double value = i * 1.1111111;
double_writer->WriteBatch(1, nullptr, nullptr, &value);
}
// Write the ByteArray column. Make every alternate values NULL
parquet::ByteArrayWriter* ba_writer =
static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
parquet::ByteArray value;
char hello[FIXED_LENGTH] = "parquet";
hello[7] = static_cast<char>(static_cast<int>('0') + i / 100);
hello[8] = static_cast<char>(static_cast<int>('0') + (i / 10) % 10);
hello[9] = static_cast<char>(static_cast<int>('0') + i % 10);
if (i % 2 == 0) {
int16_t definition_level = 1;
value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
value.len = FIXED_LENGTH;
ba_writer->WriteBatch(1, &definition_level, nullptr, &value);
} else {
int16_t definition_level = 0;
ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr);
}
}
// Write the FixedLengthByteArray column
parquet::FixedLenByteArrayWriter* flba_writer =
static_cast<parquet::FixedLenByteArrayWriter*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
parquet::FixedLenByteArray value;
char v = static_cast<char>(i);
char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
value.ptr = reinterpret_cast<const uint8_t*>(&flba[0]);
flba_writer->WriteBatch(1, nullptr, nullptr, &value);
}
// Close the ParquetFileWriter
file_writer->Close();
// Write the bytes to file
DCHECK(out_file->Close().ok());
} catch (const std::exception& e) {
std::cerr << "Parquet write error: " << e.what() << std::endl;
return -1;
}
/**********************************************************************************
PARQUET READER EXAMPLE
**********************************************************************************/
try {
// Create a ParquetReader instance
std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false);
// Get the File MetaData
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
// Get the number of RowGroups
int num_row_groups = file_metadata->num_row_groups();
assert(num_row_groups == 1);
// Get the number of Columns
int num_columns = file_metadata->num_columns();
assert(num_columns == 8);
// Iterate over all the RowGroups in the file
for (int r = 0; r < num_row_groups; ++r) {
// Get the RowGroup Reader
std::shared_ptr<parquet::RowGroupReader> row_group_reader =
parquet_reader->RowGroup(r);
int64_t values_read = 0;
int64_t rows_read = 0;
int16_t definition_level;
int16_t repetition_level;
int i;
std::shared_ptr<parquet::ColumnReader> column_reader;
ARROW_UNUSED(rows_read); // prevent warning in release build
// Get the Column Reader for the boolean column
column_reader = row_group_reader->Column(0);
parquet::BoolReader* bool_reader =
static_cast<parquet::BoolReader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (bool_reader->HasNext()) {
bool value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
bool expected_value = ((i % 2) == 0) ? true : false;
assert(value == expected_value);
i++;
}
// Get the Column Reader for the Int32 column
column_reader = row_group_reader->Column(1);
parquet::Int32Reader* int32_reader =
static_cast<parquet::Int32Reader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (int32_reader->HasNext()) {
int32_t value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
assert(value == i);
i++;
}
// Get the Column Reader for the Int64 column
column_reader = row_group_reader->Column(2);
parquet::Int64Reader* int64_reader =
static_cast<parquet::Int64Reader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (int64_reader->HasNext()) {
int64_t value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level,
&value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
int64_t expected_value = i * 1000 * 1000;
expected_value *= 1000 * 1000;
assert(value == expected_value);
if ((i % 2) == 0) {
assert(repetition_level == 1);
} else {
assert(repetition_level == 0);
}
i++;
}
// Get the Column Reader for the Int96 column
column_reader = row_group_reader->Column(3);
parquet::Int96Reader* int96_reader =
static_cast<parquet::Int96Reader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (int96_reader->HasNext()) {
parquet::Int96 value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
parquet::Int96 expected_value;
ARROW_UNUSED(expected_value); // prevent warning in release build
expected_value.value[0] = i;
expected_value.value[1] = i + 1;
expected_value.value[2] = i + 2;
for (int j = 0; j < 3; j++) {
assert(value.value[j] == expected_value.value[j]);
}
i++;
}
// Get the Column Reader for the Float column
column_reader = row_group_reader->Column(4);
parquet::FloatReader* float_reader =
static_cast<parquet::FloatReader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (float_reader->HasNext()) {
float value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
float expected_value = static_cast<float>(i) * 1.1f;
assert(value == expected_value);
i++;
}
// Get the Column Reader for the Double column
column_reader = row_group_reader->Column(5);
parquet::DoubleReader* double_reader =
static_cast<parquet::DoubleReader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (double_reader->HasNext()) {
double value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
double expected_value = i * 1.1111111;
assert(value == expected_value);
i++;
}
// Get the Column Reader for the ByteArray column
column_reader = row_group_reader->Column(6);
parquet::ByteArrayReader* ba_reader =
static_cast<parquet::ByteArrayReader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (ba_reader->HasNext()) {
parquet::ByteArray value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read =
ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// Verify the value written
char expected_value[FIXED_LENGTH] = "parquet";
ARROW_UNUSED(expected_value); // prevent warning in release build
expected_value[7] = static_cast<char>('0' + i / 100);
expected_value[8] = static_cast<char>('0' + (i / 10) % 10);
expected_value[9] = static_cast<char>('0' + i % 10);
if (i % 2 == 0) { // only alternate values exist
// There are no NULL values in the rows written
assert(values_read == 1);
assert(value.len == FIXED_LENGTH);
assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
assert(definition_level == 1);
} else {
// There are NULL values in the rows written
assert(values_read == 0);
assert(definition_level == 0);
}
i++;
}
// Get the Column Reader for the FixedLengthByteArray column
column_reader = row_group_reader->Column(7);
parquet::FixedLenByteArrayReader* flba_reader =
static_cast<parquet::FixedLenByteArrayReader*>(column_reader.get());
// Read all the rows in the column
i = 0;
while (flba_reader->HasNext()) {
parquet::FixedLenByteArray value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
char v = static_cast<char>(i);
char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
i++;
}
}
} catch (const std::exception& e) {
std::cerr << "Parquet read error: " << e.what() << std::endl;
return -1;
}
std::cout << "Parquet Writing and Reading Complete" << std::endl;
return 0;
}

View File

@@ -0,0 +1,71 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <arrow/io/file.h>
#include <arrow/util/logging.h>
#include <parquet/api/reader.h>
#include <parquet/api/writer.h>
using parquet::ConvertedType;
using parquet::Repetition;
using parquet::Type;
using parquet::schema::GroupNode;
using parquet::schema::PrimitiveNode;
constexpr int FIXED_LENGTH = 10;
static std::shared_ptr<GroupNode> SetupSchema() {
parquet::schema::NodeVector fields;
// Create a primitive node named 'boolean_field' with type:BOOLEAN,
// repetition:REQUIRED
fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED,
Type::BOOLEAN, ConvertedType::NONE));
// Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
// logical type:TIME_MILLIS
fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32,
ConvertedType::TIME_MILLIS));
// Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED
fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64,
ConvertedType::NONE));
fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96,
ConvertedType::NONE));
fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT,
ConvertedType::NONE));
fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE,
ConvertedType::NONE));
// Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL
fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY,
ConvertedType::NONE));
// Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY,
// repetition:REQUIRED, field_length = FIXED_LENGTH
fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED,
Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE,
FIXED_LENGTH));
// Create a GroupNode named 'schema' using the primitive nodes defined above
// This GroupNode is the root node of the schema tree
return std::static_pointer_cast<GroupNode>(
GroupNode::Make("schema", Repetition::REQUIRED, fields));
}

6
parquet/parquet/src_copy.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Copy every file listed in deps.log (one path per line) into ./src and strip
# the local arrow checkout prefix from the copied directory tree.

# -p: do not complain when src/ is left over from an earlier run
mkdir -p src

# read the list line by line; -r and IFS= keep backslashes and surrounding
# whitespace intact, quoting keeps paths with spaces in one piece
while IFS= read -r f; do
  # skip blank lines, cp would only fail on them
  [ -n "$f" ] || continue
  # --parents recreates the full source path below src/
  cp --parents "$f" src/
done < deps.log

# move the tree below the checkout's cpp/ directory directly into src/ and
# drop the now superfluous absolute-path skeleton
mv src/home/mario/Desktop/arrow/cpp/* src/
rm -r src/home/

171
parquet/parquet/src_setup.sh Executable file
View File

@@ -0,0 +1,171 @@
#!/bin/bash
#-----------------------------------------------------------------------------#
# Collect the list of files in the arrow repository that a given main source
# file depends on.
#
# NOTE: before starting to extract the minimal required sources and dependencies
#       run
#         $ cd cpp/
#         $ cmake -D ARROW_PARQUET=ON
#       in the arrow repository
#
# provide
# - local path of clone of https://github.com/apache/arrow.git
# - name/path of main .hpp file of cython extension
# - name of the file the dependencies are written to

repo="$1"
main="$2"
depf="$3"

# check CLI arguments
if [ -z "$repo" ] || [ -z "$main" ] || [ -z "$depf" ]; then
  echo "please provide..."
  echo "1. local path of arrow repository"
  echo "2. name of main .hpp/.cpp"
  echo "3. desired name of dependency file"
  # show the name the script was actually invoked with
  echo -e "example:\n$0 /home/mario/Desktop/Record_Evolution/arrow/ reader-writer.cc deps.log"
  exit 1
fi

# the include path is built as "${repo}cpp/src/" further down, so make sure the
# repository path ends with exactly one slash
repo="${repo%/}/"

echo -e "extracting sources from/for \n1: ${repo}\n2: ${main}\n3: ${depf}\n"

# make sure the dependency file is empty
# (quoted, so a file name containing spaces is handled as one argument)
rm -f "${depf}"
touch "${depf}"

# define maximal recursion depth
maxdep=8
#-----------------------------------------------------------------------------#
# define function to list dependencies of source file in repository recursively
#
# arguments:
#   $1  path of the arrow repository, with trailing slash (the include path is
#       built as "${1}cpp/src/")
#   $2  source file to analyse
#   $3  file collecting the dependencies, one path per line
#   $4  current recursion depth
# globals:
#   maxdep  maximal recursion depth (read only)
# returns:
#   1 if g++ did not produce a dependency file for the source, 0 otherwise
listDependencies()
{
  # keep all state local: the function calls itself, and with global variables
  # a nested call would overwrite the caller's recursion depth and file names
  local rep="$1"
  local src="$2"
  local dep="$3"
  local rec="$4"
  local srcbase depfile objfile fs fssourc
  local -a found=()

  echo -e "\nstarting 'listDependencies()' for\n1. ${rep}\n2. ${src}\n3. ${dep}\n4. ${rec}"

  # generate dependency file (and remove resulting object file)
  echo -e "g++ -c -MMD ${src} -I ${rep}cpp/src/\n"
  g++ -c -MMD "${src}" -I "${rep}cpp/src/"

  # derive name of dependency and object files: g++ writes them to the current
  # directory, named after the source file with its extension replaced
  srcbase=$(basename "${src}")
  depfile="${srcbase%.*}.d"
  objfile="${srcbase%.*}.o"
  rm -f "${objfile}"

  # without a dependency file (e.g. compilation failed) there is nothing to list
  if [ ! -f "${depfile}" ]; then
    echo "no dependency file ${depfile} generated for ${src}"
    return 1
  fi

  # list dependencies by
  # 1. deleting the trailing line-continuation backslashes
  # 2. splitting the rule into one token per line (a single line of the
  #    dependency file may hold several paths)
  # 3. keeping only files located in the repository
  # 4. removing the source itself
  # 5. removing empty lines
  # The result is kept in a local array instead of a shared temporary file,
  # which nested calls would overwrite while the caller is still reading it.
  mapfile -t found < <(sed 's/\\$//' "${depfile}" \
                         | tr -s ' \t' '\n\n' \
                         | grep -F -- "${rep}" \
                         | grep -Fxv -- "${src}" \
                         | awk 'NF')

  for fs in "${found[@]}"; do

    echo "$fs"

    # check if dependency is already in the list
    if grep -Fxq -- "$fs" "$dep"; then
      echo "dep exist"
    else
      echo "dep does not exist yet => adding it"
      # add dependency to list
      echo "$fs" >> "${dep}"

      # check for corresponding source file (foo.h -> foo.cc, foo.hpp -> foo.cpp)
      case "$fs" in
        *.hpp) fssourc="${fs%.hpp}.cpp" ;;
        *.h)   fssourc="${fs%.h}.cc" ;;
        *)     fssourc="$fs" ;;
      esac
      echo "${fssourc}"

      if [ -f "$fssourc" ]; then
        echo "source file exists"
        # list nested dependencies
        if [ "$rec" -lt "$maxdep" ]; then
          # call recursion with incremented recursion depth
          listDependencies "${rep}" "${fssourc}" "${dep}" "$((rec + 1))"
        else
          echo "maximal recursion depth exceeded"
        fi
      else
        echo "source file does not exist"
      fi
    fi

    echo ""

  done
}
#-----------------------------------------------------------------------------#
# call function to list dependencies (recursively), starting at depth 0;
# the arguments are quoted so that paths containing spaces stay in one piece
listDependencies "${repo}" "${main}" "${depf}" 0
# # generate dependency file (and remove resulting object file)
# echo -e "generate dependencies:\ng++ -c -MMD ${main} -I ./ -I ${repo}cpp/src/\n"
# g++ -c -MMD ${main} -I ${repo}cpp/src/
# rm $(echo ${main} | sed 's/.cc/.o/g')
#
# # derive name of dependency file
# dep=$(echo ${main} | sed 's/.cc/.d/g')
#
# if [ -f "$dep" ]; then
#
# # list dependencies
# cat ${dep} | sed 's/ /\n/g' | awk 'NF' | grep -v '\\' | grep '\/' > deps.log
#
# # extract list of headers
# cat deps.log | sed ':a;N;$!ba;s/\n/ /g' > deps-headers.log
# echo "list of required headers ($(cat deps.log | wc -l))"
# cat deps-headers.log
# echo ""
#
# # imply list of sources
# cat deps.log | sed 's/.h$/.cc/g' | sed 's/.hpp$/.cpp/g' > sources_raw.log
# cat sources_raw.log | while read f
# do
# if [ -f "$f" ]; then
# echo $f >> sources_check.log
# fi
# done
# cat sources_check.log | sed ':a;N;$!ba;s/\n/ /g' > deps-sources.log
# echo "list of required sources ($(cat sources_check.log | wc -l))"
# cat deps-sources.log
# echo ""
#
# # remove all temporary files
# rm ${dep} deps.log
# rm sources_raw.log sources_check.log
#
# # copy required headers and sources
# echo -e "copy required headers and sources"
# mkdir temp/
# cp --parents `cat deps-headers.log` temp/
# cp --parents `cat deps-sources.log` temp/
# mv temp${repo}cpp/src/* ./
# rm -r temp
#
# # remove dependencies
# #rm deps-headers.log deps-sources.log
#
# # show files
# ls -lh
#
# else
#
# echo -e "\nERROR: failed to generate dependency file\n"
#
# fi

View File

@@ -0,0 +1,26 @@
# Image that builds Apache Arrow (C++) and pyarrow from source.
FROM ubuntu:19.10

# Toolchain and Python build prerequisites.
# build-essentials is disabled; g++ and make are installed explicitly instead.
RUN apt-get update -y && apt-get install -y \
    apt-utils \
    git g++ \
    make cmake \
    pkg-config \
    python3 \
    python3-setuptools \
    cython3 \
    python3-numpy

# Shallow clone of the arrow sources
RUN git clone https://github.com/apache/arrow.git --single-branch --depth=1

# Add the build scripts from the build context and run them
COPY . ./
RUN chmod u+x ./build_arrow_cpp.sh
RUN chmod u+x ./build_arrow_python.sh
RUN ./build_arrow_cpp.sh
RUN ./build_arrow_python.sh

#RUN chmod u+x ./build_arrow.sh
#CMD ["./build_arrow.sh"]

# Keep the container alive for a day by default. In exec form every argument is
# its own array element; ["sleep 1d"] would look for a binary named "sleep 1d".
CMD ["sleep", "1d"]

View File

@@ -0,0 +1,5 @@
#!/bin/bash
# Build the pyarrow build image and open an interactive shell in it.

# stop when the build fails instead of starting a container from a missing or
# outdated image
set -e

docker build . --tag=pyarrowbuild:latest
docker run -it pyarrowbuild:latest /bin/bash

View File

@@ -0,0 +1,65 @@
#!/bin/bash
# Clone apache/arrow, build and install the C++ libraries with Parquet and
# Python support, then build the pyarrow extension in place.

# NOTE(review): this blocks forever, so none of the commands below are ever
# reached - presumably a debugging leftover to keep a container alive; confirm
# before removing
sleep infinity

startts=$(date)
echo "starting build process at ${startts}..."

echo -e "\nhome directory is..."
pwd

echo -e "\ncloning apache/arrow..."
git clone https://github.com/apache/arrow.git --single-branch --depth=1

# show the directory layout after cloning
echo -e "\nls -lh /\n"
ls -lh /
echo -e "\nls -lh arrow/\n"
ls -lh arrow/
echo -e "\nls -lh arrow/python/\n"
ls -lh arrow/python

# build and install the C++ libraries
# (-p: do not fail when the build directory is left over from an earlier run)
mkdir -p arrow/cpp/build
pushd arrow/cpp/build
cmake -DCMAKE_INSTALL_PREFIX="$ARROW_HOME" \
      -DCMAKE_INSTALL_LIBDIR=lib \
      -DARROW_WITH_BZ2=ON \
      -DARROW_WITH_ZLIB=ON \
      -DARROW_WITH_ZSTD=ON \
      -DARROW_WITH_LZ4=ON \
      -DARROW_WITH_SNAPPY=ON \
      -DARROW_WITH_BROTLI=ON \
      -DARROW_PARQUET=ON \
      -DARROW_PYTHON=ON \
      -DARROW_BUILD_TESTS=OFF \
      -DARROW_WITH_HDFS=OFF \
      ..
make -j4
make install
popd

#cython --version
cython3 --version

# build the python extension: only the parquet component is enabled
pushd arrow/python
export ARROW_LIB_DIR=/lib/
export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_CUDA=0
# spelled in upper case throughout; the former "FlIGHT" variant set a variable
# of a different name
export PYARROW_WITH_FLIGHT=0
export PYARROW_WITH_DATASET=0
export PYARROW_WITH_ORC=0
export PYARROW_WITH_PLASMA=0
export PYARROW_WITH_S3FS=0
export PYARROW_WITH_HDFS=0
export PYARROW_WITH_GANDIVA=0
python3 setup.py build_ext --inplace
popd

echo " started build process at ${startts} ..."
finishts=$(date)
echo "finishing build process at ${finishts}..."

View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Configure, build and install the Arrow C++ libraries (with Parquet and
# Python support) from the arrow/ checkout in the current directory.

# abort on the first failing step; otherwise the script would end with the
# exit status of the final popd and report success after a failed build
set -e

# -p: do not fail when the build directory is left over from an earlier run
mkdir -p arrow/cpp/build
pushd arrow/cpp/build

cmake -DCMAKE_INSTALL_PREFIX="$ARROW_HOME" \
      -DCMAKE_INSTALL_LIBDIR=lib \
      -DARROW_WITH_BZ2=ON \
      -DARROW_WITH_ZLIB=ON \
      -DARROW_WITH_ZSTD=ON \
      -DARROW_WITH_LZ4=ON \
      -DARROW_WITH_SNAPPY=ON \
      -DARROW_WITH_BROTLI=ON \
      -DARROW_PARQUET=ON \
      -DARROW_PYTHON=ON \
      -DARROW_BUILD_TESTS=OFF \
      -DARROW_WITH_HDFS=OFF \
      -DARROW_WITH_IPC=OFF \
      ..

make -j4
make install

popd

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Build and install pyarrow from the arrow/python checkout with only the
# parquet component enabled.

# abort on the first failing step; otherwise the script would end with the
# exit status of the final popd and report success after a failed build
set -e

pushd arrow/python

export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_CUDA=0
# spelled in upper case throughout; the former "FlIGHT" variant set a variable
# of a different name
export PYARROW_WITH_FLIGHT=0
export PYARROW_WITH_DATASET=0
export PYARROW_WITH_ORC=0
export PYARROW_WITH_PLASMA=0
export PYARROW_WITH_S3FS=0
export PYARROW_WITH_HDFS=0
export PYARROW_WITH_GANDIVA=0

# python3 setup.py build_ext --inplace
python3 setup.py install

popd

View File

@@ -0,0 +1,23 @@
# Helper targets for building, running and cleaning up the pyarrow build image.

# None of the targets creates a file of its own name. Declaring them phony keeps
# e.g. "make build" working when a directory called "build" exists next to this
# Makefile (run-bash mounts exactly such a directory).
.PHONY: build run run-bash run-volume rm-container rm-image

# build the image
build :
	docker build . --tag pyarrowbuild

# start a container with the image's default command
run :
	docker run -it pyarrowbuild:latest

# open a shell in a container with ./build mounted at /home
# ($(CURDIR) is make's working directory; "$(pwd)" would be expanded by make as
# an undefined variable and yield an empty string)
run-bash :
	docker run -it --volume="$(CURDIR)/build:/home" pyarrowbuild:latest /bin/bash

# start a container with the build directory on the pi mounted at /arrow/python/
run-volume :
	docker run -it -v /home/pirate/pyarrow/build/:/arrow/python/ pyarrowbuild:latest

#sudo docker run -it --volume=$(pwd)/build:/home ubuntu:latest /bin/bash

# remove the containers named in the last 26 lines of "docker ps -a"
# "$$" passes a literal "$" on to the shell, and the lines are joined with
# "; \" because make runs every recipe line in a separate shell, which would
# lose the variable between assignment and use.
# NOTE(review): with fewer than 26 lines of output the header row of
# "docker ps" is part of the selection - confirm whether that is acceptable
rm-container :
	cont=$$(docker ps -a | tail -n 26 | awk '{print $$NF}' | sed ':a;N;$$!ba;s/\n/ /g'); \
	echo $${cont}; \
	docker rm $${cont}

# remove all images listed by "docker image ls"
rm-image :
	img=$$(docker image ls --quiet | sed ':a;N;$$!ba;s/\n/ /g'); \
	docker image rm $${img}

View File

@@ -0,0 +1,18 @@
import pyarrow.parquet as pq
import pyarrow.csv as pv
# Round trip: CSV file -> Arrow table -> Brotli-compressed Parquet file -> table.
source_csv = 'pressureVacuum.csv'
target_parquet = 'pressureVacuum.parquet'

# Parse the CSV with an explicit comma delimiter and show the resulting table.
comma_separated = pv.ParseOptions(delimiter=',')
table = pv.read_csv(source_csv, parse_options=comma_separated)
print(table)

# Compression names noted alongside the write call:
# {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD'}
pq.write_table(table, target_parquet, compression='BROTLI')

# Read the file back without restricting the columns and show it again.
restored = pq.read_table(target_parquet, columns=None)
print(restored)

8
parquet/pyarrow_arm/sync_pi.sh Executable file
View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Copy the given file(s) to the pyarrow directory on the pi.
#
# usage: sync_pi.sh <file> [<file> ...]

if [ -z "$1" ]; then
  # say why nothing happened instead of exiting silently
  echo "usage: $0 <file> [<file> ...]" >&2
  exit 1
fi

# "$@" keeps every argument intact, including names containing spaces
scp "$@" pirate@mf-pi-40:/home/pirate/pyarrow/