# Preprocessor flags applied to every translation unit: include paths for the
# bundled headers plus the USING_R / GGML_BUILD_FOR_R defines.
PKG_CPPFLAGS = -I../inst/include -I./llama -I./ggml -I./ggml/ggml-cpu -DUSING_R=1 -DGGML_BUILD_FOR_R
PKG_CXXFLAGS = -DNDEBUG -DGGML_USE_CPU
PKG_CFLAGS = -DNDEBUG -DGGML_USE_CPU

# Cross-platform configuration without OpenMP for stability.
#
# Flags for the vendored ggml/llama sources. They are layered on top of R's
# ALL_CFLAGS / ALL_CXXFLAGS (which already contain PKG_CFLAGS / PKG_CXXFLAGS)
# so these objects get the optimisation level and any site/user flags that R
# was configured with; -ftree-vectorize has no effect without an -O level.
# This matches the fallback %.o rules, which also compile with ALL_*FLAGS.
# The assignments must stay recursive (`=`): ALL_CFLAGS / ALL_CXXFLAGS come
# from R's Makeconf, which make reads after this file.
GGML_CXXFLAGS = $(ALL_CXXFLAGS) -fPIC -ftree-vectorize
GGML_CFLAGS = $(ALL_CFLAGS) -DUSING_R=1 -fPIC -ftree-vectorize

# Link against the LAPACK/BLAS libraries R was built with, plus the Fortran
# runtime libraries they may need.
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)

# macOS only: define GGML_USE_ACCELERATE for both compilers and link Apple's
# Accelerate framework (vDSP vector operations). On any other host, including
# one where the `uname` probe prints nothing, the flags are left untouched.
UNAME_S := $(shell uname -s 2>/dev/null)
ifeq "$(UNAME_S)" "Darwin"
  PKG_LIBS += -framework Accelerate
  GGML_CFLAGS += -DGGML_USE_ACCELERATE
  GGML_CXXFLAGS += -DGGML_USE_ACCELERATE
endif

# Core object files (common to all architectures).
# Each group lists bare basenames; the directory prefix and the .o suffix are
# attached below. Groups are concatenated in the same order as before.
GGML_BASE_NAMES := ggml ggml-alloc gguf \
	ggml-backend ggml-backend-reg \
	ggml-quants ggml-threading ggml-opt

LLAMA_NAMES := llama-adapter llama-arch llama-batch llama-chat \
	llama-context llama-cparams llama-grammar llama-graph \
	llama-hparams llama-impl llama-io llama-kv-cache-iswa \
	llama-kv-cache llama-memory-hybrid llama-memory-recurrent \
	llama-memory llama-mmap llama-model-loader llama-model-saver llama-model \
	llama-quant llama-sampling llama-vocab llama \
	unicode-data unicode

# ggml-cpu-c / ggml-cpu-cpp are the two objects built from ggml-cpu.c and
# ggml-cpu.cpp by the explicit rules further down.
GGML_CPU_NAMES := ggml-cpu-c ggml-cpu-cpp ops \
	binary-ops unary-ops vec \
	traits repack quants

CORE_OBJECTS = bindings.o RcppExports.o \
	$(addsuffix .o,$(addprefix ggml/,$(GGML_BASE_NAMES))) \
	$(addsuffix .o,$(addprefix llama/,$(LLAMA_NAMES))) \
	$(addsuffix .o,$(addprefix ggml/ggml-cpu/,$(GGML_CPU_NAMES))) \
	simd_info.o

# ============================================================================
# SIMD Optimization Configuration
# ============================================================================
# Architecture detection and SIMD flag selection.
#
# Override at install time by setting the EDGEMODELR_SIMD environment variable:
#   EDGEMODELR_SIMD=AVX2 R CMD INSTALL edgemodelr
#
# Valid values: GENERIC, SSE42, AVX, AVX2, AVX512, NATIVE
# Default (no env var, or a value not listed above): portable generic build
#   - compiled with -DGGML_CPU_GENERIC and no architecture-specific flags,
#     on every architecture (x86_64, aarch64/arm64, and others alike)
#   - SIMD acceleration is opt-in only, via the values listed above
# ============================================================================

UNAME_M := $(shell uname -m 2>/dev/null)

# Requested SIMD level with surrounding whitespace removed, so a value such as
# "AVX2 " still selects AVX2 instead of silently falling through to generic.
SIMD_LEVEL := $(strip $(EDGEMODELR_SIMD))

# Objects from ggml/ggml-cpu/arch/x86; linked only for the x86 SIMD levels.
X86_ARCH_OBJECTS := ggml/ggml-cpu/arch/x86/quants.o \
	ggml/ggml-cpu/arch/x86/repack.o \
	ggml/ggml-cpu/arch/x86/cpu-feats.o

# SIMD_FLAGS is appended to both GGML_CFLAGS and GGML_CXXFLAGS at the end of
# this section, so the C and C++ flag sets cannot drift apart.
# Starting point: portable generic build with no arch-specific objects.
SIMD_FLAGS := -DGGML_CPU_GENERIC
ARCH_OBJECTS :=

ifeq ($(SIMD_LEVEL),NATIVE)
  # Some systems report x86-64 as "amd64" from `uname -m`.
  ifneq ($(filter x86_64 amd64,$(UNAME_M)),)
    SIMD_FLAGS := -march=native
    ARCH_OBJECTS := $(X86_ARCH_OBJECTS)
  else
    # No arch-specific objects are built for non-x86 hosts, so the generic
    # kernels are selected exactly as in the default build; -march=native
    # still applies to the compiler's own code generation.
    # NOTE(review): presumably the vendored ggml needs GGML_CPU_GENERIC
    # whenever no arch/ objects are linked -- confirm against arch-fallback.h.
    SIMD_FLAGS := -march=native -DGGML_CPU_GENERIC
  endif
else ifeq ($(SIMD_LEVEL),AVX512)
  SIMD_FLAGS := -mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx2 -mfma -mf16c -DGGML_AVX512 -DGGML_AVX2 -DGGML_FMA -DGGML_F16C -DGGML_AVX -DGGML_SSE42
  ARCH_OBJECTS := $(X86_ARCH_OBJECTS)
else ifeq ($(SIMD_LEVEL),AVX2)
  SIMD_FLAGS := -mavx2 -mfma -mf16c -DGGML_AVX2 -DGGML_FMA -DGGML_F16C -DGGML_AVX -DGGML_SSE42
  ARCH_OBJECTS := $(X86_ARCH_OBJECTS)
else ifeq ($(SIMD_LEVEL),AVX)
  SIMD_FLAGS := -mavx -mf16c -DGGML_AVX -DGGML_F16C -DGGML_SSE42
  ARCH_OBJECTS := $(X86_ARCH_OBJECTS)
else ifeq ($(SIMD_LEVEL),SSE42)
  SIMD_FLAGS := -msse4.2 -DGGML_SSE42
  ARCH_OBJECTS := $(X86_ARCH_OBJECTS)
else
  # GENERIC, unset, or unrecognized: portable generic build (no arch-specific
  # flags). CRAN policy prohibits non-portable compiler flags (-msse4.2,
  # -mavx, etc.) in the default install path. Users who want SIMD
  # acceleration should set EDGEMODELR_SIMD=SSE42 (or AVX/AVX2/AVX512/NATIVE)
  # at install time.
  ifneq ($(filter-out GENERIC,$(SIMD_LEVEL)),)
    # Values are case-sensitive; tell the user their request was not honoured
    # rather than quietly producing an unaccelerated build.
    $(warning edgemodelr: unrecognized EDGEMODELR_SIMD value '$(SIMD_LEVEL)'; building the portable generic variant. Valid values: GENERIC SSE42 AVX AVX2 AVX512 NATIVE)
  endif
endif

GGML_CXXFLAGS += $(SIMD_FLAGS)
GGML_CFLAGS += $(SIMD_FLAGS)

# Complete objects list with architecture support
OBJECTS = $(CORE_OBJECTS) $(ARCH_OBJECTS)

# `all` and `clean` name commands, not files. Declaring them phony keeps a
# stray file called "all" or "clean" from making the target look up to date.
.PHONY: all clean

# Default goal: the package shared library (SHLIB is supplied by R's build).
all: $(SHLIB)

# Remove build products. ARCH_OBJECTS depends on the EDGEMODELR_SIMD value in
# effect when `clean` runs, so the x86 objects are also removed by glob;
# otherwise a SIMD build cleaned without the variable would leave them behind.
clean:
	rm -f $(OBJECTS) $(SHLIB) ggml/ggml-cpu/arch/x86/*.o

# ggml-cpu.c and ggml-cpu.cpp share a basename, so the ggml/%.o pattern rules
# would map both sources onto ggml-cpu.o. Each one is compiled into its own
# explicitly named object instead.
ggml/ggml-cpu/ggml-cpu-cpp.o: ggml/ggml-cpu/ggml-cpu.cpp
	$(CXX) $(ALL_CPPFLAGS) $(GGML_CXXFLAGS) -o $@ -c $<

ggml/ggml-cpu/ggml-cpu-c.o: ggml/ggml-cpu/ggml-cpu.c
	$(CC) $(ALL_CPPFLAGS) $(GGML_CFLAGS) -o $@ -c $<

# The package's own sources (R bindings and the Rcpp-generated exports) are
# compiled with R's standard C++ flags, not the GGML flag set.
bindings.o RcppExports.o: %.o: %.cpp
	$(CXX) $(ALL_CPPFLAGS) $(ALL_CXXFLAGS) -fPIC -o $@ -c $<

# Vendored llama and ggml sources are compiled with the GGML flag set.
llama/%.o: llama/%.cpp
	$(CXX) $(ALL_CPPFLAGS) $(GGML_CXXFLAGS) -o $@ -c $<

# For ggml the C rule is listed before the C++ rule; keep that order.
ggml/%.o: ggml/%.c
	$(CC) $(ALL_CPPFLAGS) $(GGML_CFLAGS) -o $@ -c $<

ggml/%.o: ggml/%.cpp
	$(CXX) $(ALL_CPPFLAGS) $(GGML_CXXFLAGS) -o $@ -c $<

# simd_info.cpp is built with the GGML flags, not the package defaults, so it
# is compiled with the same SIMD options and GGML_* defines as the ggml
# sources (used for SIMD feature reporting).
simd_info.o: %.o: %.cpp
	$(CXX) $(ALL_CPPFLAGS) $(GGML_CXXFLAGS) -o $@ -c $<

# x86-specific sources under ggml/ggml-cpu/arch/x86, compiled with the GGML
# flag set. The C rule stays ahead of the C++ rule.
X86_SRC_DIR := ggml/ggml-cpu/arch/x86

$(X86_SRC_DIR)/%.o: $(X86_SRC_DIR)/%.c
	$(CC) $(ALL_CPPFLAGS) $(GGML_CFLAGS) -o $@ -c $<

$(X86_SRC_DIR)/%.o: $(X86_SRC_DIR)/%.cpp
	$(CXX) $(ALL_CPPFLAGS) $(GGML_CXXFLAGS) -o $@ -c $<

# Catch-all rules for any object not covered by a more specific rule above:
# compile with R's standard flags plus -fPIC.
%.o: %.c
	$(CC) $(ALL_CPPFLAGS) $(ALL_CFLAGS) -fPIC -o $@ -c $<

%.o: %.cpp
	$(CXX) $(ALL_CPPFLAGS) $(ALL_CXXFLAGS) -fPIC -o $@ -c $<
