Formal Power Series Composition
(fps_composition.hpp)

View this file on GitHub
Last update: 2025-01-19 15:28:01+08:00
Include: #include "fps_composition.hpp"
Link: https://arxiv.org/abs/2404.05177
Link: https://noshi91.hatenablog.com/entry/2024/03/16/224034

Composition of formal power series

We are given $f\in\mathbb{C}\left\lbrack\left\lbrack x\right\rbrack\right\rbrack,g\in x\mathbb{C}\left\lbrack\left\lbrack x\right\rbrack\right\rbrack$, and we want to compute $f(g)\bmod{x^n}$.

It is possible to compute $f(g)$ if $f,g\in\mathbb{C}\left\lbrack x\right\rbrack$, since each coefficient of $f(g)$ is a sum of finitely many terms. But if $f,g\in\mathbb{C}\left\lbrack\left\lbrack x\right\rbrack\right\rbrack$ then $g(0)=0$ is required.

Kinoshita–Li’s algorithm

Let $f(x):=\sum _ {j = 0}^{n - 1}f _ j x^j$, we have

\[f(g)=\sum _ {j \geq 0}f _ j g(x)^j\]

Consider the bivariate formal Laurent series in $x,y$

\[\begin{aligned} f(g) &= \left\lbrack y^0\right\rbrack\frac{f\left(y^{-1}\right)}{1 - y\cdot g(x)} \in \mathbb{C}\left\lbrack\left\lbrack x\right\rbrack\right\rbrack\left(\left(y\right)\right) \\ &= \sum _ {j\geq 0}\left(\left(\cdots + f _ j y^{-j} + \cdots \right)y^j\cdot g(x)^j\right) \end{aligned}\]

since we only need to compute $f(g)\bmod{x^n}$, it’s okay to treat $g(x)$ as a polynomial. Unlike Kinoshita and Li’s paper, I don’t want to manipulate in formal power series ring by multiplying $\frac{f\left(y^{-1}\right)}{1 - y\cdot g(x)}$ by $y^{n-1}$. If we did so, we may care about the parity of $n-1$.

Kinoshita–Li’s algorithm is actually a bivariate variant of Bostan–Mori’s algorithm, we have

\[\begin{aligned} \frac{P\left(y\right)}{Q\left(x,y\right)}\bmod{x^n}&=\left(\frac{P\left(y\right)}{Q\left(x,y\right)Q\left(-x,y\right)}\bmod{x^n}\right)Q\left(-x,y\right)\bmod{x^n} \\ &=\left(\frac{P(y)}{V\left(x^2,y\right)}\bmod{x^n}\right)Q\left(-x,y\right)\bmod{x^n} \\ &=\left.\left(\frac{P(y)}{V(z,y)}\bmod{z^{\left\lceil n/2\right\rceil}}\right)\right|_{z=x^2}Q\left(-x,y\right)\bmod{x^n} \end{aligned}\]

where $V\left(x^2,y\right)=Q(x,y)Q(-x,y)$. We can solve the problem when $n=1$, since now we only need to compute $\frac{P(y)}{Q(x,y)}\bmod{x}=\frac{P(y)}{Q(0,y)}\in\mathbb{C}\left(\left( y\right)\right)$. Actually we do not need to compute $\frac{1}{Q(0,y)}$, since it’s just $\left(1-g(0)y\right)^{-k}\in\mathbb{C}\left\lbrack\left\lbrack y\right\rbrack\right\rbrack$ for a certain $k\in\mathbb{N}$, the coefficients of which are known as the binomial coefficients multiplied by some powers of $g(0)$. (noshi91 told me on twitter, I did’t notice that.)

\[\begin{array}{ll} &\textbf{Algorithm }\operatorname{\mathsf{KinoshitaLi}}\text{:} \\ &\textbf{Input}\text{: }P\in\mathbb{C}\left\lbrack y\right\rbrack,Q\in\mathbb{C}\left\lbrack x,y\right\rbrack ,n,m\in\mathbb{N}_{>0}\text{.} \\ &\textbf{Output}\text{: }\left\lbrack y^{\left(-m,0\right\rbrack}\right\rbrack\dfrac{P\left(y^{-1}\right)}{Q(x,y)}\bmod{x^n}\text{.} \\ &\textbf{Require}\text{: }\left\lbrack x^0y^0\right\rbrack Q=1\text{.} \\ 1&\textbf{if }n=1\textbf{ then return }\begin{bmatrix}\left\lbrack y^{-m+1}\right\rbrack\frac{P\left(y^{-1}\right)}{Q(0,y)} & \cdots & \left\lbrack y^0\right\rbrack\frac{P\left(y^{-1}\right)}{Q(0,y)}\end{bmatrix} \\ 2&V(x^2,y)\gets Q(x,y)Q(-x,y)\bmod{x^n}\bmod{y^{1 + \deg _ y P}} \\ 3&d \gets \deg _ y Q\left(-x,y\right) \\ 4&m' \gets \min \left\lbrace m+d, 1 + \deg _ y P\right\rbrace \\ 5&\begin{bmatrix} t _ {- m' + 1} & \cdots & t _ 0\end{bmatrix}\gets \operatorname{\mathsf{KinoshitaLi}}\left(P(y),V(x,y),\left\lceil n/2\right\rceil,m'\right) \\ 6&T(x,y) \gets \sum_{j=- m' + 1}^0 t _ jy^j \\ 7&U(x,y) \gets T(x^2,y)Q(-x,y)\bmod{x^n} \\ 8&\textbf{return }\begin{bmatrix}\left\lbrack y^{-m+1}\right\rbrack U(x,y) & \cdots & \left\lbrack y^0\right\rbrack U(x,y)\end{bmatrix} \end{array}\]

Kinoshita–Li’s algorithm is simple and elegant.

TODO: optimization of this algorithm.

Sample Code

I have written a simplified version of Kinoshita–Li’s algorithm which could be found at 形式幂级数复合丨复合逆 - OI Wiki. If we remove the redundant memory movements/copies, we can obtain the following faster but hard-to-explain code.

// CXXFLAGS=-std=c++17 -Wall -Wextra
#include <algorithm>
#include <cassert>
#include <cstring>
#include <tuple>
#include <utility>
#include <vector>

using uint         = unsigned;
using ull          = unsigned long long;
constexpr uint MOD = 998244353;

constexpr uint PowMod(uint a, ull e) {
    for (uint res = 1;; a = (ull)a * a % MOD) {
        if (e & 1) res = (ull)res * a % MOD;
        if ((e /= 2) == 0) return res;
    }
}

constexpr uint InvMod(uint a) { return PowMod(a, MOD - 2); }

constexpr uint QUAD_NONRESIDUE = 3;
constexpr int LOG2_ORD         = __builtin_ctz(MOD - 1);
constexpr uint ZETA            = PowMod(QUAD_NONRESIDUE, (MOD - 1) >> LOG2_ORD);
constexpr uint INV_ZETA        = InvMod(ZETA);

std::pair<std::vector<uint>, std::vector<uint>> GetFFTRoot(int n) {
    assert((n & (n - 1)) == 0);
    if (n / 2 == 0) return {};
    std::vector<uint> root(n / 2), inv_root(n / 2);
    root[0] = inv_root[0] = 1;
    for (int i = 0; (1 << i) < n / 2; ++i)
        root[1 << i]               = PowMod(ZETA, 1LL << (LOG2_ORD - i - 2)),
                  inv_root[1 << i] = PowMod(INV_ZETA, 1LL << (LOG2_ORD - i - 2));
    for (int i = 1; i < n / 2; ++i)
        root[i]     = (ull)root[i - (i & (i - 1))] * root[i & (i - 1)] % MOD,
        inv_root[i] = (ull)inv_root[i - (i & (i - 1))] * inv_root[i & (i - 1)] % MOD;
    return {root, inv_root};
}

void Butterfly(uint a[], int n, const uint root[]) {
    assert((n & (n - 1)) == 0);
    for (int i = n; i >= 2; i /= 2)
        for (int j = 0; j < n; j += i)
            for (int k = j; k < j + i / 2; ++k) {
                const uint u = a[k];
                a[k + i / 2] = (ull)a[k + i / 2] * root[j / i] % MOD;
                if ((a[k] += a[k + i / 2]) >= MOD) a[k] -= MOD;
                if ((a[k + i / 2] = u + MOD - a[k + i / 2]) >= MOD) a[k + i / 2] -= MOD;
            }
}

void InvButterfly(uint a[], int n, const uint root[]) {
    assert((n & (n - 1)) == 0);
    for (int i = 2; i <= n; i *= 2)
        for (int j = 0; j < n; j += i)
            for (int k = j; k < j + i / 2; ++k) {
                const uint u = a[k];
                if ((a[k] += a[k + i / 2]) >= MOD) a[k] -= MOD;
                a[k + i / 2] = (ull)(u + MOD - a[k + i / 2]) * root[j / i] % MOD;
            }
}

int GetFFTSize(int n) {
    int len = 1;
    while (len < n) len *= 2;
    return len;
}

void FFT(uint a[], int n, const uint root[]) { Butterfly(a, n, root); }

void InvFFT(uint a[], int n, const uint root[]) {
    InvButterfly(a, n, root);
    const uint invn = InvMod(n);
    for (int i = 0; i < n; ++i) a[i] = (ull)a[i] * invn % MOD;
}

std::vector<uint> FPSComp(std::vector<uint> f, std::vector<uint> g, int n) {
    assert(empty(g) || g[0] == 0);
    const int len = GetFFTSize(n);
    std::vector<uint> root, inv_root;
    tie(root, inv_root) = GetFFTRoot(len * 4);
    // [y^(-1)] (f(y) / (-g(x) + y)) mod x^n in R[x]((y^(-1)))
    const auto KinoshitaLi = [&](auto &&KinoshitaLi, std::vector<uint> &P, std::vector<uint> Q,
                                 int d, int n) {
        assert((int)size(P) == d * n * 2);
        assert((int)size(Q) == d * n * 2);
        if (n == 1) return;
        Q.resize(d * n * 4);
        Q[d * n * 2] = 1;
        FFT(data(Q), d * n * 4, data(root));
        std::vector<uint> V(d * n * 2);
        for (int i = 0; i < d * n * 4; i += 2) V[i / 2] = (ull)Q[i] * Q[i + 1] % MOD;
        InvFFT(data(V), d * n * 2, data(inv_root));
        assert(V[0] == 1);
        V[0] = 0;
        for (int i = 0; i < d * 2; ++i)
            std::memset(data(V) + i * n + n / 2, 0, sizeof(uint) * (n / 2));
        KinoshitaLi(KinoshitaLi, P, std::move(V), d * 2, n / 2);
        FFT(data(P), d * n * 2, data(root));
        for (int i = 0; i < d * n * 4; i += 2) {
            const uint u = Q[i];
            Q[i]         = (ull)P[i / 2] * Q[i + 1] % MOD;
            Q[i + 1]     = (ull)P[i / 2] * u % MOD;
        }
        InvFFT(data(Q), d * n * 4, data(inv_root));
        for (int i = 0; i < d; ++i) {
            uint *const u = data(P) + i * n * 2;
            std::memcpy(u, data(Q) + (i + d) * (n * 2), sizeof(uint) * n);
            std::memset(u + n, 0, sizeof(uint) * n);
        }
    };
    f.resize(len * 2);
    g.resize(len * 2);
    for (int i = len - 1; i >= 0; --i) f[i * 2] = f[i], f[i * 2 + 1] = 0;
    for (int i = 0; i < len; ++i) g[i] = (g[i] != 0 ? MOD - g[i] : 0);
    std::memset(data(g) + len, 0, sizeof(uint) * len);
    KinoshitaLi(KinoshitaLi, f, std::move(g), 1, len);
    f.resize(n);
    return f;
}

// Power Projection: [x^(n-1)] (fg^i) for i=0,..,n-1
std::vector<uint> PowProj(std::vector<uint> f, std::vector<uint> g, int n) {
    assert(empty(g) || g[0] == 0);
    const int len = GetFFTSize(n);
    std::vector<uint> root, inv_root;
    tie(root, inv_root) = GetFFTRoot(len * 4);
    // [x^(n-1)] (f(x) / (-g(x) + y)) in R[x]((y^(-1)))
    const auto KinoshitaLi = [&](std::vector<uint> &P, std::vector<uint> &Q, int d, int n) {
        assert((int)size(P) == d * n * 2);
        assert((int)size(Q) == d * n * 2);
        P.insert(begin(P), d * n * 2, 0u);
        Q.resize(d * n * 4);
        std::vector<uint> nextP(d * n * 4);
        for (; n > 1; d *= 2, n /= 2) {
            Q[d * n * 2] = 1;
            FFT(data(P), d * n * 4, data(inv_root));
            FFT(data(Q), d * n * 4, data(root));
            uint *const nP = data(nextP) + d * n * 2;
            for (int i = 0; i < d * n * 4; i += 2) {
                if ((nP[i / 2] = ((ull)P[i] * Q[i + 1] + (ull)P[i + 1] * Q[i]) % MOD) & 1)
                    nP[i / 2] += MOD;
                nP[i / 2] /= 2;
                Q[i / 2] = (ull)Q[i] * Q[i + 1] % MOD;
            }
            InvFFT(nP, d * n * 2, data(root));
            InvFFT(data(Q), d * n * 2, data(inv_root));
            assert(Q[0] == 1);
            Q[0] = 0;
            for (int i = 0; i < d * 2; ++i) {
                std::memset(nP + i * n, 0, sizeof(uint) * (n / 2));
                std::memset(data(Q) + i * n + n / 2, 0, sizeof(uint) * (n / 2));
            }
            P.swap(nextP);
            std::memset(data(P), 0, sizeof(uint) * (d * n * 2));
            std::memset(data(Q) + d * n * 2, 0, sizeof(uint) * (d * n * 2));
        }
        P.erase(begin(P), begin(P) + d * n * 2);
    };
    f.insert(begin(f), len - n, 0);
    f.resize(len);
    reverse(begin(f), end(f));
    f.insert(begin(f), len, 0u);
    g.resize(len * 2);
    for (int i = 0; i < len; ++i) g[i] = (g[i] != 0 ? MOD - g[i] : 0);
    std::memset(data(g) + len, 0, sizeof(uint) * len);
    KinoshitaLi(f, g, 1, len);
    for (int i = 0; i < len; ++i) f[i] = f[i * 2 + 1];
    f.resize(n);
    return f;
}

std::vector<uint> FPSPow1(std::vector<uint> g, uint e, int n) {
    assert(!empty(g) && g[0] == 1);
    if (n == 1) return std::vector<uint>{1u};
    std::vector<uint> inv(n), f(n);
    inv[1] = f[0] = 1;
    for (int i = 2; i < n; ++i) inv[i] = (ull)(MOD - MOD / i) * inv[MOD % i] % MOD;
    for (int i = 1; i < n; ++i) f[i] = (ull)f[i - 1] * (e + MOD + 1 - i) % MOD * inv[i] % MOD;
    g[0] = 0;
    return FPSComp(f, g, n);
}

std::vector<uint> FPSRev(std::vector<uint> f, int n) {
    assert(size(f) >= 2 && f[0] == 0 && f[1] != 0);
    if (n == 1) return std::vector<uint>{0u};
    f.resize(n);
    const uint invf1 = InvMod(f[1]);
    uint invf1p      = 1;
    for (int i = 0; i < n; ++i) f[i] = (ull)f[i] * invf1p % MOD, invf1p = (ull)invf1p * invf1 % MOD;
    std::vector<uint> inv(n);
    inv[1] = 1;
    for (int i = 2; i < n; ++i) inv[i] = (ull)(MOD - MOD / i) * inv[MOD % i] % MOD;
    auto proj = PowProj(std::vector<uint>{1u}, f, n);
    for (int i = 1; i < n; ++i) proj[i] = (ull)proj[i] * (n - 1) % MOD * inv[i] % MOD;
    reverse(begin(proj), end(proj));
    auto res = FPSPow1(proj, InvMod(MOD + 1 - n), n - 1);
    for (int i = 0; i < n - 1; ++i) res[i] = (ull)res[i] * invf1 % MOD;
    res.insert(begin(res), 0);
    return res;
}

std::pair<std::vector<uint>, std::vector<uint>> GetFactorial(int n) {
    if (n == 0) return {};
    std::vector<uint> factorial(n), inv_factorial(n);
    factorial[0] = inv_factorial[0] = 1;
    if (n == 1) return {factorial, inv_factorial};
    std::vector<uint> inv(n);
    inv[1] = 1;
    for (int i = 2; i < n; ++i) inv[i] = (ull)(MOD - MOD / i) * inv[MOD % i] % MOD;
    for (int i = 1; i < n; ++i)
        factorial[i]     = (ull)factorial[i - 1] * i % MOD,
        inv_factorial[i] = (ull)inv_factorial[i - 1] * inv[i] % MOD;
    return {factorial, inv_factorial};
}

// f(x) |-> f(x + c)
std::vector<uint> TaylorShift(std::vector<uint> f, uint c) {
    if (empty(f) || c == 0) return f;
    const int n                           = size(f);
    const auto [factorial, inv_factorial] = GetFactorial(n);
    for (int i = 0; i < n; ++i) f[i] = (ull)f[i] * factorial[i] % MOD;
    std::vector<uint> g(n);
    uint cp = 1;
    for (int i = 0; i < n; ++i) g[i] = (ull)cp * inv_factorial[i] % MOD, cp = (ull)cp * c % MOD;
    const int len               = GetFFTSize(n * 2 - 1);
    const auto [root, inv_root] = GetFFTRoot(len);
    f.resize(len);
    g.resize(len);
    FFT(data(f), len, data(inv_root));
    FFT(data(g), len, data(root));
    for (int i = 0; i < len; ++i) f[i] = (ull)f[i] * g[i] % MOD;
    InvFFT(data(f), len, data(root));
    f.resize(n);
    for (int i = 0; i < n; ++i) f[i] = (ull)f[i] * inv_factorial[i] % MOD;
    return f;
}

std::vector<uint> PolyComp(const std::vector<uint> &f, const std::vector<uint> &g, int n) {
    if (empty(g) || g[0] == 0) return FPSComp(f, g, n);
    auto gg = g;
    gg[0]   = 0;
    return FPSComp(TaylorShift(f, g[0]), std::move(gg), n);
}

References

Yasunori Kinoshita, Baitian Li. Power Series Composition in Near-Linear Time. FOCS 2024: 2180-2185 url: https://arxiv.org/abs/2404.05177
Alin Bostan, Ryuhei Mori. A Simple and Fast Algorithm for Computing the N-th Term of a Linearly Recurrent Sequence. SOSA 2021: 118-132 url: https://arxiv.org/abs/2008.08822
R. P. Brent and H. T. Kung. 1978. Fast Algorithms for Manipulating Formal Power Series. J. ACM 25, 4 (Oct. 1978), 581–595. url: https://doi.org/10.1145/322092.322099
David Harvey. Faster polynomial multiplication via multipoint Kronecker substitution. J. Symb. Comput. 44(10): 1502-1510 (2009) url: https://doi.org/10.1016/j.jsc.2009.05.004
Alin Bostan, Bruno Salvy, Éric Schost. Power series composition and change of basis. ISSAC 2008: 269-276 url: https://arxiv.org/abs/0804.2337

Depends on

Required by

famous_sequence.hpp

Verified with

Code

#pragma once

#include "binomial.hpp"
#include "fft.hpp"
#include "fps_basic.hpp"
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

// returns f(g) mod x^n
// see:
// [1]: Yasunori Kinoshita, Baitian Li. Power Series Composition in Near-Linear Time.
//      https://arxiv.org/abs/2404.05177
template<typename Tp>
inline std::vector<Tp> composition(const std::vector<Tp> &f, const std::vector<Tp> &g, int n) {
    if (n <= 0) return {};
    if (g.empty()) {
        std::vector<Tp> res(n);
        if (!f.empty()) res[0] = f[0];
        return res;
    }

    // [y^(-1)] (f(y) / (-g(x) + y)) mod x^n in R[x]((y^(-1)))
    auto rec = [g0 = g[0]](auto &&rec, const std::vector<Tp> &P, const std::vector<Tp> &Q, int d,
                           int n) {
        if (n == 1) {
            std::vector<Tp> invQ(d + 1);
            auto &&bin = Binomial<Tp>::get(d * 2);
            Tp gg      = 1;
            for (int i = 0; i <= d; ++i) invQ[d - i] = bin.binom(d + i - 1, d - 1) * gg, gg *= g0;
            // invQ[i] = [y^(-2d + i)]Q^(-1)
            // P[0,d-1] * invQ[-2d,-d] => [0,d-1] * [0,d]
            // take [-d,-1] => take [d,2d-1]
            auto PinvQ = convolution(P, invQ);
            PinvQ.erase(PinvQ.begin(), PinvQ.begin() + d);
            PinvQ.resize(d);
            return PinvQ;
        }

        std::vector<Tp> dftQ(d * n * 4);
        for (int i = 0; i < d; ++i)
            for (int j = 0; j < n; ++j) dftQ[i * (n * 2) + j] = Q[i * n + j];
        dftQ[d * n * 2] = 1;
        fft(dftQ);
        std::vector<Tp> V(d * n * 2);
        for (int i = 0; i < d * n * 4; i += 2) V[i / 2] = dftQ[i] * dftQ[i + 1];
        inv_fft(V);
        V[0] -= 1;

        for (int i = 1; i < d * 2; ++i)
            for (int j = 0; j < n / 2; ++j) V[i * (n / 2) + j] = V[i * n + j];
        V.resize(d * n);

        const auto T = rec(rec, P, std::move(V), d * 2, n / 2);

        std::vector<Tp> dftT(d * n * 2);
        for (int i = 0; i < d * 2; ++i)
            for (int j = 0; j < n / 2; ++j) dftT[i * n + j] = T[i * (n / 2) + j];
        fft(dftT);

        std::vector<Tp> U(d * n * 4);
        for (int i = 0; i < d * n * 4; i += 2) {
            U[i]     = dftT[i / 2] * dftQ[i + 1];
            U[i + 1] = dftT[i / 2] * dftQ[i];
        }
        inv_fft(U);

        // [-2d,d-1] => [0,3d-1]
        // take [-d,-1] => take [d,2d-1]
        for (int i = 0; i < d; ++i)
            for (int j = 0; j < n; ++j) U[i * n + j] = U[(i + d) * (n * 2) + j];
        U.resize(d * n);
        return U;
    };

    const int k = fft_len(std::max<int>(n, f.size()));
    std::vector<Tp> Q(k);
    for (int i = 0; i < std::min<int>(k, g.size()); ++i) Q[i] = -g[i];

    auto res = rec(rec, f, Q, 1, k);
    res.resize(n);
    return res;
}

// returns [x^k]gf^0, [x^k]gf, ..., [x^k]gf^(n-1)
// see:
// [1]: noshi91. FPS の合成と逆関数、冪乗の係数列挙 Θ(n (log(n))^2)
//      https://noshi91.hatenablog.com/entry/2024/03/16/224034
template<typename Tp> inline std::vector<Tp>
enum_kth_term_of_power(const std::vector<Tp> &f, const std::vector<Tp> &g, int k, int n) {
    if (k < 0 || n <= 0) return {};
    if (f.empty()) {
        std::vector<Tp> res(n);
        if (k < (int)g.size()) res[0] = g[k];
        return res;
    }

    // [x^k] (g(x) / (-f(x) + y)) in R[x]((y^(-1)))
    std::vector<Tp> P(g), Q(k + 1);
    P.resize(k + 1);
    for (int i = 0; i < std::min<int>(k + 1, f.size()); ++i) Q[i] = -f[i];

    int d = 1;
    for (; k; d *= 2, k /= 2) {
        const int len = fft_len((d * 2) * ((k + 1) * 2) - 1);
        std::vector<Tp> dftP(len), dftQ(len);
        for (int i = 0; i < d; ++i)
            for (int j = 0; j <= k; ++j) {
                dftP[i * ((k + 1) * 2) + j] = P[i * (k + 1) + j];
                dftQ[i * ((k + 1) * 2) + j] = Q[i * (k + 1) + j];
            }
        dftQ[d * (k + 1) * 2] = 1;
        fft(dftP);
        fft(dftQ);

        P.resize(len / 2);
        Q.resize(len / 2);
        if (k & 1) {
            auto &&root = FftInfo<Tp>::get().inv_root(len / 2);
            for (int i = 0; i < len; i += 2) {
                P[i / 2] = (dftP[i] * dftQ[i + 1] - dftP[i + 1] * dftQ[i]).div_by_2() * root[i / 2];
                Q[i / 2] = dftQ[i] * dftQ[i + 1];
            }
        } else {
            for (int i = 0; i < len; i += 2) {
                P[i / 2] = (dftP[i] * dftQ[i + 1] + dftP[i + 1] * dftQ[i]).div_by_2();
                Q[i / 2] = dftQ[i] * dftQ[i + 1];
            }
        }
        inv_fft(P);
        inv_fft(Q);
        if (d * (k + 1) * 4 >= len) Q[(d * (k + 1) * 4) % len] -= 1;

        for (int i = 1; i < d * 2; ++i)
            for (int j = 0; j <= k / 2; ++j) {
                P[i * (k / 2 + 1) + j] = P[i * (k + 1) + j];
                Q[i * (k / 2 + 1) + j] = Q[i * (k + 1) + j];
            }
        P.resize(d * 2 * (k / 2 + 1));
        Q.resize(d * 2 * (k / 2 + 1));
    }

    std::vector<Tp> invQ(n + 1);
    auto &&bin = Binomial<Tp>::get(d + n);
    Tp ff      = 1;
    for (int i = 0; i <= n; ++i) invQ[n - i] = bin.binom(d + i - 1, d - 1) * ff, ff *= f[0];
    // invQ[i] = [y^(-2d + i)]Q^(-1)
    // P[0,d-1] * invQ[-(d+n),-d] => [0,d-1] * [0,n]
    auto PinvQ = convolution(P, invQ);
    // take [-n,-1] => take [d,d+n-1]
    PinvQ.erase(PinvQ.begin(), PinvQ.begin() + d);
    PinvQ.resize(n);
    // output => [-1,-n] reverse
    // before I just reverse it and mistaken something.
    std::reverse(PinvQ.begin(), PinvQ.end());
    return PinvQ;
}

// returns g s.t. f(g) = g(f) = x mod x^n
template<typename Tp> inline std::vector<Tp> reversion(std::vector<Tp> f, int n) {
    if (n <= 0 || f.size() < 2) return {};
    assert(order(f) == 1);
    const auto if1 = f[1].inv();
    if (n == 1) return {Tp(0)};
    f.resize(n);
    Tp ff = 1;
    for (int i = 1; i < n; ++i) f[i] *= ff *= if1;
    auto a     = enum_kth_term_of_power(f, {Tp(1)}, n - 1, n);
    auto &&bin = Binomial<Tp>::get(n);
    for (int i = 1; i < n; ++i) a[i] *= (n - 1) * bin.inv(i);
    auto b = fps_pow(std::vector(a.rbegin(), a.rend() - 1), Tp(1 - n).inv().val(), n - 1);
    for (int i = 0; i < n - 1; ++i) b[i] *= if1;
    b.insert(b.begin(), Tp(0));
    return b;
}

#line 2 "fps_composition.hpp"

#line 2 "binomial.hpp"

#include <algorithm>
#include <vector>

template<typename Tp> class Binomial {
    std::vector<Tp> factorial_, invfactorial_;

    Binomial() : factorial_{Tp(1)}, invfactorial_{Tp(1)} {}

    void preprocess(int n) {
        if (const int nn = factorial_.size(); nn < n) {
            int k = nn;
            while (k < n) k *= 2;
            k = std::min<long long>(k, Tp::mod());
            factorial_.resize(k);
            invfactorial_.resize(k);
            for (int i = nn; i < k; ++i) factorial_[i] = factorial_[i - 1] * i;
            invfactorial_.back() = factorial_.back().inv();
            for (int i = k - 2; i >= nn; --i) invfactorial_[i] = invfactorial_[i + 1] * (i + 1);
        }
    }

public:
    static const Binomial &get(int n) {
        static Binomial bin;
        bin.preprocess(n);
        return bin;
    }

    Tp binom(int n, int m) const {
        return n < m ? Tp() : factorial_[n] * invfactorial_[m] * invfactorial_[n - m];
    }
    Tp inv(int n) const { return factorial_[n - 1] * invfactorial_[n]; }
    Tp factorial(int n) const { return factorial_[n]; }
    Tp inv_factorial(int n) const { return invfactorial_[n]; }
};
#line 2 "fft.hpp"

#line 4 "fft.hpp"
#include <cassert>
#include <iterator>
#include <memory>
#line 8 "fft.hpp"

template<typename Tp> class FftInfo {
    static Tp least_quadratic_nonresidue() {
        for (int i = 2;; ++i)
            if (Tp(i).pow((Tp::mod() - 1) / 2) == -1) return Tp(i);
    }

    const int ordlog2_;
    const Tp zeta_;
    const Tp invzeta_;
    const Tp imag_;
    const Tp invimag_;

    mutable std::vector<Tp> root_;
    mutable std::vector<Tp> invroot_;

    FftInfo()
        : ordlog2_(__builtin_ctzll(Tp::mod() - 1)),
          zeta_(least_quadratic_nonresidue().pow((Tp::mod() - 1) >> ordlog2_)),
          invzeta_(zeta_.inv()), imag_(zeta_.pow(1LL << (ordlog2_ - 2))), invimag_(-imag_),
          root_{Tp(1), imag_}, invroot_{Tp(1), invimag_} {}

public:
    static const FftInfo &get() {
        static FftInfo info;
        return info;
    }

    Tp imag() const { return imag_; }
    Tp inv_imag() const { return invimag_; }
    Tp zeta() const { return zeta_; }
    Tp inv_zeta() const { return invzeta_; }
    const std::vector<Tp> &root(int n) const {
        // [0, n)
        assert((n & (n - 1)) == 0);
        if (const int s = root_.size(); s < n) {
            root_.resize(n);
            for (int i = __builtin_ctz(s); (1 << i) < n; ++i) {
                const int j = 1 << i;
                root_[j]    = zeta_.pow(1LL << (ordlog2_ - i - 2));
                for (int k = j + 1; k < j * 2; ++k) root_[k] = root_[k - j] * root_[j];
            }
        }
        return root_;
    }
    const std::vector<Tp> &inv_root(int n) const {
        // [0, n)
        assert((n & (n - 1)) == 0);
        if (const int s = invroot_.size(); s < n) {
            invroot_.resize(n);
            for (int i = __builtin_ctz(s); (1 << i) < n; ++i) {
                const int j = 1 << i;
                invroot_[j] = invzeta_.pow(1LL << (ordlog2_ - i - 2));
                for (int k = j + 1; k < j * 2; ++k) invroot_[k] = invroot_[k - j] * invroot_[j];
            }
        }
        return invroot_;
    }
};

inline int fft_len(int n) {
    --n;
    n |= n >> 1, n |= n >> 2, n |= n >> 4, n |= n >> 8;
    return (n | n >> 16) + 1;
}

namespace detail {

template<typename Iterator> inline void
butterfly_n(Iterator a, int n,
            const std::vector<typename std::iterator_traits<Iterator>::value_type> &root) {
    assert(n > 0);
    assert((n & (n - 1)) == 0);
    const int bn = __builtin_ctz(n);
    if (bn & 1) {
        for (int i = 0; i < n / 2; ++i) {
            const auto a0 = a[i], a1 = a[i + n / 2];
            a[i] = a0 + a1, a[i + n / 2] = a0 - a1;
        }
    }
    for (int i = n >> (bn & 1); i >= 4; i /= 4) {
        const int i4 = i / 4;
        for (int k = 0; k < i4; ++k) {
            const auto a0 = a[k + i4 * 0], a1 = a[k + i4 * 1];
            const auto a2 = a[k + i4 * 2], a3 = a[k + i4 * 3];
            const auto a02p = a0 + a2, a02m = a0 - a2;
            const auto a13p = a1 + a3, a13m = (a1 - a3) * root[1];
            a[k + i4 * 0] = a02p + a13p, a[k + i4 * 1] = a02p - a13p;
            a[k + i4 * 2] = a02m + a13m, a[k + i4 * 3] = a02m - a13m;
        }
        for (int j = i, m = 2; j < n; j += i, m += 2) {
            const auto r = root[m], r2 = r * r, r3 = r2 * r;
            for (int k = j; k < j + i4; ++k) {
                const auto a0 = a[k + i4 * 0], a1 = a[k + i4 * 1] * r;
                const auto a2 = a[k + i4 * 2] * r2, a3 = a[k + i4 * 3] * r3;
                const auto a02p = a0 + a2, a02m = a0 - a2;
                const auto a13p = a1 + a3, a13m = (a1 - a3) * root[1];
                a[k + i4 * 0] = a02p + a13p, a[k + i4 * 1] = a02p - a13p;
                a[k + i4 * 2] = a02m + a13m, a[k + i4 * 3] = a02m - a13m;
            }
        }
    }
}

template<typename Iterator> inline void
inv_butterfly_n(Iterator a, int n,
                const std::vector<typename std::iterator_traits<Iterator>::value_type> &root) {
    assert(n > 0);
    assert((n & (n - 1)) == 0);
    const int bn = __builtin_ctz(n);
    for (int i = 4; i <= (n >> (bn & 1)); i *= 4) {
        const int i4 = i / 4;
        for (int k = 0; k < i4; ++k) {
            const auto a0 = a[k + i4 * 0], a1 = a[k + i4 * 1];
            const auto a2 = a[k + i4 * 2], a3 = a[k + i4 * 3];
            const auto a01p = a0 + a1, a01m = a0 - a1;
            const auto a23p = a2 + a3, a23m = (a2 - a3) * root[1];
            a[k + i4 * 0] = a01p + a23p, a[k + i4 * 1] = a01m + a23m;
            a[k + i4 * 2] = a01p - a23p, a[k + i4 * 3] = a01m - a23m;
        }
        for (int j = i, m = 2; j < n; j += i, m += 2) {
            const auto r = root[m], r2 = r * r, r3 = r2 * r;
            for (int k = j; k < j + i4; ++k) {
                const auto a0 = a[k + i4 * 0], a1 = a[k + i4 * 1];
                const auto a2 = a[k + i4 * 2], a3 = a[k + i4 * 3];
                const auto a01p = a0 + a1, a01m = a0 - a1;
                const auto a23p = a2 + a3, a23m = (a2 - a3) * root[1];
                a[k + i4 * 0] = a01p + a23p, a[k + i4 * 1] = (a01m + a23m) * r;
                a[k + i4 * 2] = (a01p - a23p) * r2, a[k + i4 * 3] = (a01m - a23m) * r3;
            }
        }
    }
    if (bn & 1) {
        for (int i = 0; i < n / 2; ++i) {
            const auto a0 = a[i], a1 = a[i + n / 2];
            a[i] = a0 + a1, a[i + n / 2] = a0 - a1;
        }
    }
}

} // namespace detail

// FFT_n: A(x) |-> bit-reversed order of [A(1), A(zeta_n), ..., A(zeta_n^(n-1))]
template<typename Iterator> inline void fft_n(Iterator a, int n) {
    using Tp = typename std::iterator_traits<Iterator>::value_type;
    detail::butterfly_n(a, n, FftInfo<Tp>::get().root(n / 2));
}

template<typename Tp> inline void fft(std::vector<Tp> &a) { fft_n(a.begin(), a.size()); }

// IFFT_n: bit-reversed order of [A(1), A(zeta_n), ..., A(zeta_n^(n-1))] |-> A(x)
template<typename Iterator> inline void inv_fft_n(Iterator a, int n) {
    using Tp = typename std::iterator_traits<Iterator>::value_type;
    detail::inv_butterfly_n(a, n, FftInfo<Tp>::get().inv_root(n / 2));
    const Tp iv = Tp::mod() - (Tp::mod() - 1) / n;
    for (int i = 0; i < n; ++i) a[i] *= iv;
}

template<typename Tp> inline void inv_fft(std::vector<Tp> &a) { inv_fft_n(a.begin(), a.size()); }

// IFFT_n^T: A(x) |-> 1/n FFT_n((x^n A(x^(-1))) mod (x^n - 1))
template<typename Iterator> inline void transposed_inv_fft_n(Iterator a, int n) {
    using Tp    = typename std::iterator_traits<Iterator>::value_type;
    const Tp iv = Tp::mod() - (Tp::mod() - 1) / n;
    for (int i = 0; i < n; ++i) a[i] *= iv;
    detail::butterfly_n(a, n, FftInfo<Tp>::get().inv_root(n / 2));
}

template<typename Tp> inline void transposed_inv_fft(std::vector<Tp> &a) {
    transposed_inv_fft_n(a.begin(), a.size());
}

// FFT_n^T : FFT_n((x^n A(x^(-1))) mod (x^n - 1)) |-> n A(x)
template<typename Iterator> inline void transposed_fft_n(Iterator a, int n) {
    using Tp = typename std::iterator_traits<Iterator>::value_type;
    detail::inv_butterfly_n(a, n, FftInfo<Tp>::get().root(n / 2));
}

template<typename Tp> inline void transposed_fft(std::vector<Tp> &a) {
    transposed_fft_n(a.begin(), a.size());
}

template<typename Tp> inline std::vector<Tp> convolution_fft(std::vector<Tp> a, std::vector<Tp> b) {
    if (a.empty() || b.empty()) return {};
    const int n   = a.size();
    const int m   = b.size();
    const int len = fft_len(n + m - 1);
    a.resize(len);
    b.resize(len);
    fft(a);
    fft(b);
    for (int i = 0; i < len; ++i) a[i] *= b[i];
    inv_fft(a);
    a.resize(n + m - 1);
    return a;
}

template<typename Tp> inline std::vector<Tp> square_fft(std::vector<Tp> a) {
    if (a.empty()) return {};
    const int n   = a.size();
    const int len = fft_len(n * 2 - 1);
    a.resize(len);
    fft(a);
    for (int i = 0; i < len; ++i) a[i] *= a[i];
    inv_fft(a);
    a.resize(n * 2 - 1);
    return a;
}

template<typename Tp>
inline std::vector<Tp> convolution_naive(const std::vector<Tp> &a, const std::vector<Tp> &b) {
    if (a.empty() || b.empty()) return {};
    const int n = a.size();
    const int m = b.size();
    std::vector<Tp> res(n + m - 1);
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < m; ++j) res[i + j] += a[i] * b[j];
    return res;
}

template<typename Tp>
inline std::vector<Tp> convolution(const std::vector<Tp> &a, const std::vector<Tp> &b) {
    if (std::min(a.size(), b.size()) < 60) return convolution_naive(a, b);
    if (std::addressof(a) == std::addressof(b)) return square_fft(a);
    return convolution_fft(a, b);
}
#line 2 "fps_basic.hpp"

#line 2 "semi_relaxed_conv.hpp"

#line 5 "semi_relaxed_conv.hpp"
#include <type_traits>
#include <utility>
#line 8 "semi_relaxed_conv.hpp"

template<typename Tp, typename Closure>
inline std::enable_if_t<std::is_invocable_r_v<Tp, Closure, int, const std::vector<Tp> &>,
                        std::vector<Tp>>
semi_relaxed_convolution_naive(const std::vector<Tp> &A, Closure gen, int n) {
    std::vector<Tp> B(n), AB(n);
    for (int i = 0; i < n; ++i) {
        for (int j = std::max(0, i - (int)A.size() + 1); j < i; ++j) AB[i] += A[i - j] * B[j];
        B[i] = gen(i, AB);
        if (!A.empty()) AB[i] += A[0] * B[i];
    }
    return B;
}

// returns coefficients generated by closure
// closure: gen(index, current_product)
template<typename Tp, typename Closure>
inline std::enable_if_t<std::is_invocable_r_v<Tp, Closure, int, const std::vector<Tp> &>,
                        std::vector<Tp>>
semi_relaxed_convolution(const std::vector<Tp> &A, Closure gen, int n) {
    if (A.size() < 60) return semi_relaxed_convolution_naive(A, gen, n);
    enum { BaseCaseSize = 32 };
    static_assert((BaseCaseSize & (BaseCaseSize - 1)) == 0);

    static const int Block[]     = {16, 16, 16, 16, 16};
    static const int BlockSize[] = {
        BaseCaseSize,
        BaseCaseSize * Block[0],
        BaseCaseSize * Block[0] * Block[1],
        BaseCaseSize * Block[0] * Block[1] * Block[2],
        BaseCaseSize * Block[0] * Block[1] * Block[2] * Block[3],
        BaseCaseSize * Block[0] * Block[1] * Block[2] * Block[3] * Block[4],
    };

    // returns (which_block, level)
    auto blockinfo = [](int ind) {
        int i = ind / BaseCaseSize, lv = 0;
        while ((i & (Block[lv] - 1)) == 0) i /= Block[lv++];
        return std::make_pair(i & (Block[lv] - 1), lv);
    };

    std::vector<Tp> B(n), AB(n);
    std::vector<std::vector<std::vector<Tp>>> dftA, dftB;

    for (int i = 0; i < n; ++i) {
        const int s = i & (BaseCaseSize - 1);

        // block contribution
        if (i >= BaseCaseSize && s == 0) {
            const auto [j, lv]  = blockinfo(i);
            const int blocksize = BlockSize[lv];

            if (blocksize * j == i) {
                if ((int)dftA.size() == lv) {
                    dftA.emplace_back();
                    dftB.emplace_back(Block[lv] - 1);
                }
                if ((j - 1) * blocksize < (int)A.size()) {
                    dftA[lv]
                        .emplace_back(A.begin() + (j - 1) * blocksize,
                                      A.begin() + std::min<int>((j + 1) * blocksize, A.size()))
                        .resize(blocksize * 2);
                    fft(dftA[lv][j - 1]);
                }
            }

            if (!dftA[lv].empty()) {
                dftB[lv][j - 1].resize(blocksize * 2);
                std::copy_n(B.begin() + (i - blocksize), blocksize, dftB[lv][j - 1].begin());
                std::fill_n(dftB[lv][j - 1].begin() + blocksize, blocksize, Tp(0));
                fft(dftB[lv][j - 1]);

                // middle product
                std::vector<Tp> mp(blocksize * 2);
                for (int k = 0; k < std::min<int>(j, dftA[lv].size()); ++k)
                    for (int l = 0; l < blocksize * 2; ++l)
                        mp[l] += dftA[lv][k][l] * dftB[lv][j - 1 - k][l];
                inv_fft(mp);

                for (int k = 0; k < blocksize && i + k < n; ++k) AB[i + k] += mp[k + blocksize];
            }
        }

        // basecase contribution
        for (int j = std::max(i - s, i - (int)A.size() + 1); j < i; ++j) AB[i] += A[i - j] * B[j];
        B[i] = gen(i, AB);
        if (!A.empty()) AB[i] += A[0] * B[i];
    }

    return B;
}
#line 8 "fps_basic.hpp"

template<typename Tp> inline int order(const std::vector<Tp> &a) {
    for (int i = 0; i < (int)a.size(); ++i)
        if (a[i] != 0) return i;
    return -1;
}

template<typename Tp> inline std::vector<Tp> fps_inv(const std::vector<Tp> &a, int n) {
    assert(order(a) == 0);
    if (n <= 0) return {};
    if (std::min<int>(a.size(), n) < 60)
        return semi_relaxed_convolution(
            a, [v = a[0].inv()](int n, auto &&c) { return n == 0 ? v : -c[n] * v; }, n);
    enum { Threshold = 32 };
    const int len = fft_len(n);
    std::vector<Tp> invA, shopA(len), shopB(len);
    invA = semi_relaxed_convolution(
        a, [v = a[0].inv()](int n, auto &&c) { return n == 0 ? v : -c[n] * v; }, Threshold);
    invA.resize(len);
    for (int i = Threshold * 2; i <= len; i *= 2) {
        std::fill(std::copy_n(a.begin(), std::min<int>(a.size(), i), shopA.begin()),
                  shopA.begin() + i, Tp(0));
        std::copy_n(invA.begin(), i, shopB.begin());
        fft_n(shopA.begin(), i);
        fft_n(shopB.begin(), i);
        for (int j = 0; j < i; ++j) shopA[j] *= shopB[j];
        inv_fft_n(shopA.begin(), i);
        std::fill_n(shopA.begin(), i / 2, Tp(0));
        fft_n(shopA.begin(), i);
        for (int j = 0; j < i; ++j) shopA[j] *= shopB[j];
        inv_fft_n(shopA.begin(), i);
        for (int j = i / 2; j < i; ++j) invA[j] = -shopA[j];
    }
    invA.resize(n);
    return invA;
}

template<typename Tp>
inline std::vector<Tp> fps_div(const std::vector<Tp> &a, const std::vector<Tp> &b, int n) {
    assert(order(b) == 0);
    if (n <= 0) return {};
    return semi_relaxed_convolution(
        b,
        [&, v = b[0].inv()](int n, auto &&c) {
            if (n < (int)a.size()) return (a[n] - c[n]) * v;
            return -c[n] * v;
        },
        n);
}

template<typename Tp> inline std::vector<Tp> deriv(const std::vector<Tp> &a) {
    const int n = (int)a.size() - 1;
    if (n <= 0) return {};
    std::vector<Tp> res(n);
    for (int i = 1; i <= n; ++i) res[i - 1] = a[i] * i;
    return res;
}

template<typename Tp> inline std::vector<Tp> integr(const std::vector<Tp> &a, Tp c = {}) {
    const int n = a.size() + 1;
    auto &&bin  = Binomial<Tp>::get(n);
    std::vector<Tp> res(n);
    res[0] = c;
    for (int i = 1; i < n; ++i) res[i] = a[i - 1] * bin.inv(i);
    return res;
}

template<typename Tp> inline std::vector<Tp> fps_log(const std::vector<Tp> &a, int n) {
    return integr(fps_div(deriv(a), a, n - 1));
}

template<typename Tp> inline std::vector<Tp> fps_exp(const std::vector<Tp> &a, int n) {
    if (n <= 0) return {};
    assert(a.empty() || a[0] == 0);
    return semi_relaxed_convolution(
        deriv(a),
        [bin = Binomial<Tp>::get(n)](int n, auto &&c) {
            return n == 0 ? Tp(1) : c[n - 1] * bin.inv(n);
        },
        n);
}

template<typename Tp> inline std::vector<Tp> fps_pow(std::vector<Tp> a, long long e, int n) {
    if (n <= 0) return {};
    if (e == 0) {
        std::vector<Tp> res(n);
        res[0] = 1;
        return res;
    }

    const int o = order(a);
    if (o < 0 || o > n / e || (o == n / e && n % e == 0)) return std::vector<Tp>(n);
    if (o != 0) a.erase(a.begin(), a.begin() + o);

    const Tp ia0 = a[0].inv();
    const Tp a0e = a[0].pow(e);
    const Tp me  = e;

    for (int i = 0; i < (int)a.size(); ++i) a[i] *= ia0;
    a = fps_log(a, n - o * e);
    for (int i = 0; i < (int)a.size(); ++i) a[i] *= me;
    a = fps_exp(a, n - o * e);
    for (int i = 0; i < (int)a.size(); ++i) a[i] *= a0e;

    a.insert(a.begin(), o * e, Tp(0));
    return a;
}
#line 10 "fps_composition.hpp"

// returns f(g) mod x^n
// see:
// [1]: Yasunori Kinoshita, Baitian Li. Power Series Composition in Near-Linear Time.
//      https://arxiv.org/abs/2404.05177
template<typename Tp>
inline std::vector<Tp> composition(const std::vector<Tp> &f, const std::vector<Tp> &g, int n) {
    if (n <= 0) return {};
    if (g.empty()) {
        std::vector<Tp> res(n);
        if (!f.empty()) res[0] = f[0];
        return res;
    }

    // [y^(-1)] (f(y) / (-g(x) + y)) mod x^n in R[x]((y^(-1)))
    auto rec = [g0 = g[0]](auto &&rec, const std::vector<Tp> &P, const std::vector<Tp> &Q, int d,
                           int n) {
        if (n == 1) {
            std::vector<Tp> invQ(d + 1);
            auto &&bin = Binomial<Tp>::get(d * 2);
            Tp gg      = 1;
            for (int i = 0; i <= d; ++i) invQ[d - i] = bin.binom(d + i - 1, d - 1) * gg, gg *= g0;
            // invQ[i] = [y^(-2d + i)]Q^(-1)
            // P[0,d-1] * invQ[-2d,-d] => [0,d-1] * [0,d]
            // take [-d,-1] => take [d,2d-1]
            auto PinvQ = convolution(P, invQ);
            PinvQ.erase(PinvQ.begin(), PinvQ.begin() + d);
            PinvQ.resize(d);
            return PinvQ;
        }

        std::vector<Tp> dftQ(d * n * 4);
        for (int i = 0; i < d; ++i)
            for (int j = 0; j < n; ++j) dftQ[i * (n * 2) + j] = Q[i * n + j];
        dftQ[d * n * 2] = 1;
        fft(dftQ);
        std::vector<Tp> V(d * n * 2);
        for (int i = 0; i < d * n * 4; i += 2) V[i / 2] = dftQ[i] * dftQ[i + 1];
        inv_fft(V);
        V[0] -= 1;

        for (int i = 1; i < d * 2; ++i)
            for (int j = 0; j < n / 2; ++j) V[i * (n / 2) + j] = V[i * n + j];
        V.resize(d * n);

        const auto T = rec(rec, P, std::move(V), d * 2, n / 2);

        std::vector<Tp> dftT(d * n * 2);
        for (int i = 0; i < d * 2; ++i)
            for (int j = 0; j < n / 2; ++j) dftT[i * n + j] = T[i * (n / 2) + j];
        fft(dftT);

        std::vector<Tp> U(d * n * 4);
        for (int i = 0; i < d * n * 4; i += 2) {
            U[i]     = dftT[i / 2] * dftQ[i + 1];
            U[i + 1] = dftT[i / 2] * dftQ[i];
        }
        inv_fft(U);

        // [-2d,d-1] => [0,3d-1]
        // take [-d,-1] => take [d,2d-1]
        for (int i = 0; i < d; ++i)
            for (int j = 0; j < n; ++j) U[i * n + j] = U[(i + d) * (n * 2) + j];
        U.resize(d * n);
        return U;
    };

    const int k = fft_len(std::max<int>(n, f.size()));
    std::vector<Tp> Q(k);
    for (int i = 0; i < std::min<int>(k, g.size()); ++i) Q[i] = -g[i];

    auto res = rec(rec, f, Q, 1, k);
    res.resize(n);
    return res;
}

// returns [x^k]gf^0, [x^k]gf, ..., [x^k]gf^(n-1)
// see:
// [1]: noshi91. FPS の合成と逆関数、冪乗の係数列挙 Θ(n (log(n))^2)
//      https://noshi91.hatenablog.com/entry/2024/03/16/224034
template<typename Tp> inline std::vector<Tp>
enum_kth_term_of_power(const std::vector<Tp> &f, const std::vector<Tp> &g, int k, int n) {
    if (k < 0 || n <= 0) return {};
    if (f.empty()) {
        std::vector<Tp> res(n);
        if (k < (int)g.size()) res[0] = g[k];
        return res;
    }

    // [x^k] (g(x) / (-f(x) + y)) in R[x]((y^(-1)))
    std::vector<Tp> P(g), Q(k + 1);
    P.resize(k + 1);
    for (int i = 0; i < std::min<int>(k + 1, f.size()); ++i) Q[i] = -f[i];

    int d = 1;
    for (; k; d *= 2, k /= 2) {
        const int len = fft_len((d * 2) * ((k + 1) * 2) - 1);
        std::vector<Tp> dftP(len), dftQ(len);
        for (int i = 0; i < d; ++i)
            for (int j = 0; j <= k; ++j) {
                dftP[i * ((k + 1) * 2) + j] = P[i * (k + 1) + j];
                dftQ[i * ((k + 1) * 2) + j] = Q[i * (k + 1) + j];
            }
        dftQ[d * (k + 1) * 2] = 1;
        fft(dftP);
        fft(dftQ);

        P.resize(len / 2);
        Q.resize(len / 2);
        if (k & 1) {
            auto &&root = FftInfo<Tp>::get().inv_root(len / 2);
            for (int i = 0; i < len; i += 2) {
                P[i / 2] = (dftP[i] * dftQ[i + 1] - dftP[i + 1] * dftQ[i]).div_by_2() * root[i / 2];
                Q[i / 2] = dftQ[i] * dftQ[i + 1];
            }
        } else {
            for (int i = 0; i < len; i += 2) {
                P[i / 2] = (dftP[i] * dftQ[i + 1] + dftP[i + 1] * dftQ[i]).div_by_2();
                Q[i / 2] = dftQ[i] * dftQ[i + 1];
            }
        }
        inv_fft(P);
        inv_fft(Q);
        if (d * (k + 1) * 4 >= len) Q[(d * (k + 1) * 4) % len] -= 1;

        for (int i = 1; i < d * 2; ++i)
            for (int j = 0; j <= k / 2; ++j) {
                P[i * (k / 2 + 1) + j] = P[i * (k + 1) + j];
                Q[i * (k / 2 + 1) + j] = Q[i * (k + 1) + j];
            }
        P.resize(d * 2 * (k / 2 + 1));
        Q.resize(d * 2 * (k / 2 + 1));
    }

    std::vector<Tp> invQ(n + 1);
    auto &&bin = Binomial<Tp>::get(d + n);
    Tp ff      = 1;
    for (int i = 0; i <= n; ++i) invQ[n - i] = bin.binom(d + i - 1, d - 1) * ff, ff *= f[0];
    // invQ[i] = [y^(-2d + i)]Q^(-1)
    // P[0,d-1] * invQ[-(d+n),-d] => [0,d-1] * [0,n]
    auto PinvQ = convolution(P, invQ);
    // take [-n,-1] => take [d,d+n-1]
    PinvQ.erase(PinvQ.begin(), PinvQ.begin() + d);
    PinvQ.resize(n);
    // output => [-1,-n] reverse
    // before I just reverse it and mistaken something.
    std::reverse(PinvQ.begin(), PinvQ.end());
    return PinvQ;
}

// returns g s.t. f(g) = g(f) = x mod x^n
template<typename Tp> inline std::vector<Tp> reversion(std::vector<Tp> f, int n) {
    if (n <= 0 || f.size() < 2) return {};
    assert(order(f) == 1);
    const auto if1 = f[1].inv();
    if (n == 1) return {Tp(0)};
    f.resize(n);
    Tp ff = 1;
    for (int i = 1; i < n; ++i) f[i] *= ff *= if1;
    auto a     = enum_kth_term_of_power(f, {Tp(1)}, n - 1, n);
    auto &&bin = Binomial<Tp>::get(n);
    for (int i = 1; i < n; ++i) a[i] *= (n - 1) * bin.inv(i);
    auto b = fps_pow(std::vector(a.rbegin(), a.rend() - 1), Tp(1 - n).inv().val(), n - 1);
    for (int i = 0; i < n - 1; ++i) b[i] *= if1;
    b.insert(b.begin(), Tp(0));
    return b;
}

Formal Power Series Composition (fps_composition.hpp)