第二章:环境准备
# 查看 Python 版本
!python --version
Python 3.10.18
# 查看 Llama-Index 版本
!pip list | grep llama-index
llama-index 0.14.10 llama-index-callbacks-arize-phoenix 0.6.1 llama-index-cli 0.5.3 llama-index-core 0.14.10 llama-index-embeddings-adapter 0.4.1 llama-index-embeddings-dashscope 0.4.1 llama-index-embeddings-huggingface 0.6.1 llama-index-embeddings-huggingface-optimum 0.4.1 llama-index-embeddings-modelscope 0.5.1 llama-index-embeddings-openai 0.5.1 llama-index-experimental 0.6.3 llama-index-finetuning 0.4.1 llama-index-indices-managed-llama-cloud 0.9.4 llama-index-instrumentation 0.4.2 llama-index-llms-azure-openai 0.4.2 llama-index-llms-dashscope 0.5.1 llama-index-llms-deepseek 0.2.2 llama-index-llms-mistralai 0.7.1 llama-index-llms-ollama 0.9.0 llama-index-llms-openai 0.6.5 llama-index-llms-openai-like 0.5.3 llama-index-multi-modal-llms-openai 0.6.2 llama-index-postprocessor-cohere-rerank 0.5.1 llama-index-readers-feishu-docs 0.4.1 llama-index-readers-file 0.5.4 llama-index-readers-llama-parse 0.5.1 llama-index-retrievers-bm25 0.6.5 llama-index-storage-docstore-mongodb 0.4.1 llama-index-storage-index-store-mongodb 0.5.1 llama-index-storage-kvstore-mongodb 0.4.1 llama-index-tools-vector-db 0.4.1 llama-index-vector-stores-chroma 0.5.3 llama-index-vector-stores-faiss 0.5.1 llama-index-vector-stores-milvus 0.9.3 llama-index-vector-stores-pinecone 0.7.1 llama-index-workflows 2.8.3 openinference-instrumentation-llama-index 4.3.9
# 安装依赖
!pip install -q llama-index-core llama-index-llms-openai llama-index-embeddings-openai llama-index-llms-dashscope llama-index-embeddings-dashscope
!pip install -q llama-index-vector-stores-milvus llama-index-retrievers-bm25 rank-bm25 llama-parse
!pip install -q sqlalchemy pandas python-dotenv
print("✅ 依赖安装完成")
✅ 依赖安装完成
# Global configuration: wire DashScope models into LlamaIndex's Settings singleton.
import os
import warnings
from dotenv import load_dotenv
from llama_index.llms.dashscope import DashScope
from llama_index.embeddings.dashscope import DashScopeEmbedding
from llama_index.core.settings import Settings

warnings.filterwarnings('ignore')

# Load credentials from .env; override=True lets the file win over stale shell vars.
load_dotenv(override=True)

# Both models share the same credential pair, so resolve it once.
_dashscope_key = os.getenv("DASHSCOPE_API_KEY")
_dashscope_base = os.getenv("DASHSCOPE_BASE_URL", "https://api.dashscope.aliyuncs.com")

# Chat LLM used by every LlamaIndex component that doesn't get an explicit llm=.
Settings.llm = DashScope(
    model_name="qwen-max",
    api_key=_dashscope_key,
    api_base=_dashscope_base,
)

# Embedding model used for all vector indexing / query embedding.
Settings.embed_model = DashScopeEmbedding(
    model_name="text-embedding-v4",
    api_key=_dashscope_key,
    api_base=_dashscope_base,
)

print("✅ LLM配置完成")
print(f" 模型: {Settings.llm.model_name}")
print(f" Embedding: {Settings.embed_model.model_name}")
✅ LLM配置完成 模型: qwen-max Embedding: text-embedding-v4
# Basic usage smoke test: one completion call and one embedding call
# against the globally configured models.
prompt = "你好,请介绍一下通义千问。"

response = Settings.llm.complete(prompt)
print(response)

response_embed = Settings.embed_model.get_query_embedding(prompt)
print(response_embed)
你好!通义千问是阿里云自主研发的超大规模语言模型,能够回答问题、创作文字,还能表达观点、撰写代码。作为一个AI助手,我的目标是帮助用户获得所需的信息,解决他们的问题,并提供积极的建议和支持。如果您有任何问题或需要帮助,请随时告诉我,我会尽力提供支持。 [-0.06506925076246262, -0.00677934056147933, 0.03812263160943985, 0.0034595204051584005, 0.01659327931702137, -0.0216690544039011, 0.02269352227449417, 0.10281934589147568, -0.08021896332502365, 0.12219110876321793, 0.055290237069129944, 0.03675667569041252, 0.04448675364255905, -0.04991953819990158, -0.05131654068827629, -0.04470406472682953, 0.016251789405941963, -0.013783752918243408, -0.012883462943136692, 0.007310977205634117, 0.012891224585473537, 0.00312191154807806, 0.024028435349464417, 0.05317921191453934, -0.021855320781469345, -0.0036671303678303957, 0.03483191877603531, -0.02773825079202652, -0.012642867863178253, -0.010407664813101292, -0.005525919143110514, -0.004831298720091581, 0.022988444194197655, 0.050509385764598846, 0.05004371702671051, 0.05209265276789665, -0.012316901236772537, -0.015335978008806705, 0.0011651384411379695, 0.008544995449483395, -0.004322945140302181, 0.02329888939857483, 0.005440546665340662, -0.0225538220256567, -0.012836896814405918, -0.003457579994574189, -0.042779307812452316, 0.002274008933454752, -0.029073163866996765, -0.04572853446006775, -0.007889180444180965, 0.02483559213578701, -0.020349662750959396, -0.03141702339053154, -0.04765329137444496, 0.02902659773826599, 0.0017802073853090405, 0.003263551974669099, -0.006022630725055933, 0.02319023385643959, 0.04038888216018677, 0.04482824355363846, -0.04349333047866821, 0.023671424016356468, 0.015056577511131763, -0.03455251827836037, -0.011897800490260124, -0.029212864115834236, -0.014753893949091434, -0.034769829362630844, 0.02205711044371128, 0.01597238890826702, 0.06624893844127655, 4.304997491999529e-05, -0.01809893734753132, 0.030578821897506714, -0.04560435563325882, 0.0015415529487654567, -0.007101426832377911, -0.0026795275043696165, 0.02267799898982048, 0.015335978008806705, -0.005886811297386885, -0.017835058271884918, 
0.03861934319138527, 0.03877456858754158, -0.047063447535037994, 0.05001267418265343, 0.010430948808789253, 0.06246151402592659, 0.022848743945360184, -0.0003199037746526301, -0.01304644625633955, 0.036787718534469604, 0.038215767592191696, -0.004086230881512165, -0.0036710109561681747, 0.011424371972680092, -0.007776644546538591, 0.007675750181078911, -0.022771133109927177, -0.03815367817878723, -0.02582901529967785, 0.03622891753911972, -0.009127079509198666, -0.014101959764957428, -0.024742458015680313, -0.03886770084500313, -0.009600508026778698, 0.01084228791296482, -0.01367509737610817, 0.009841103106737137, -0.020846374332904816, 0.030873743817210197, 0.018580127507448196, 0.020427273586392403, -0.01898370496928692, -0.010384381748735905, 0.011207060888409615, 0.031618811190128326, -0.02570483647286892, 0.040109481662511826, 0.033776406198740005, -0.014753893949091434, 0.01788162626326084, 0.0009657746413722634, 0.013108535669744015, 0.021343085914850235, 0.04308975115418434, 0.018797438591718674, -0.01406315341591835, -0.017291780561208725, 0.008902007713913918, -0.00022737662948202342, -0.05373801290988922, 0.13287042081356049, -0.0445798859000206, -0.0026077372021973133, -0.06910503655672073, -0.025937670841813087, 0.019573550671339035, -0.0727061927318573, -0.02623259276151657, 0.01924758404493332, -0.055817991495132446, 0.0234385896474123, 0.05941915139555931, -0.01812998205423355, -0.015941346064209938, 0.009127079509198666, -0.049702227115631104, 0.027194973081350327, -0.031339410692453384, 0.003135493490844965, -0.04367959499359131, -0.023469634354114532, -0.0009880878496915102, 0.009810058400034904, 0.009988564066588879, -0.04979535937309265, -0.036042653024196625, 0.015646422281861305, 0.03138598054647446, -0.013147341087460518, -0.002155651804059744, -0.05358278751373291, 0.05904661864042282, -0.09598956257104874, -0.04231363907456398, 0.006666804198175669, -0.02430783584713936, 0.007349783089011908, 0.03141702339053154, -0.018642215058207512, 
0.033124469220638275, -0.0035739969462156296, 0.024230223149061203, 0.005153385456651449, 0.005498755257576704, -0.07351335138082504, 0.012875702232122421, -0.0013572262832894921, 0.10132921487092972, 0.00712859071791172, 0.027024228125810623, 0.01924758404493332, -0.015405828133225441, -0.009328869171440601, 0.022646954283118248, 0.013861364684998989, 0.01380703691393137, 0.10679304599761963, -0.004051305819302797, 0.055445458739995956, 0.08052940666675568, 0.03737756609916687, -0.016888203099370003, -0.00816470105201006, 0.1374650001525879, -0.01834729313850403, 0.012891224585473537, -0.019092360511422157, -0.006895757280290127, 0.018657738342881203, -0.014971205033361912, -0.01622074656188488, 0.024400968104600906, -0.012503168545663357, -0.014482254162430763, -0.014854787848889828, -0.0016657308442518115, 0.0021149057429283857, 0.006919040810316801, 0.004144439473748207, -0.03265880420804024, 0.0014620014699175954, -0.017027903348207474, 0.04355541989207268, 0.012961074709892273, -0.011866755783557892, -0.0003662279632408172, -0.008180223405361176, -0.03334178030490875, 0.02497529238462448, -0.02165353111922741, 0.012053023092448711, -0.015141949988901615, 0.02848331816494465, 0.0077029140666127205, 0.02988032065331936, -0.037688009440898895, -0.02294187806546688, -0.07686615735292435, -0.012316901236772537, 0.01799028180539608, -0.02648094855248928, -0.03619787469506264, 0.008754545822739601, -0.01431150920689106, 0.027024228125810623, 0.015235083177685738, 0.0387435220181942, -0.030625388026237488, 0.043151840567588806, -0.015646422281861305, 0.0034304161090403795, -0.014955682680010796, -0.024478580802679062, -0.07804584503173828, 0.038681432604789734, -0.03259671479463577, -0.0006242852541618049, -0.012580779381096363, 0.030097631737589836, -0.053644876927137375, -0.0003007435007020831, -0.019464895129203796, -0.0043695117346942425, -0.011323477141559124, -0.016500145196914673, -0.004606225993484259, 0.049857448786497116, 0.016655368730425835, 
-0.05823946371674538, -0.0316653810441494, 0.020054740831255913, 0.018549082800745964, 0.01965116150677204, -0.039612770080566406, -0.0021284879185259342, -0.008568279445171356, -0.030734045431017876, 0.013543158769607544, 0.027443328872323036, 0.015118665993213654, -0.01952698454260826, -0.02876271866261959, 0.02303501032292843, -0.02446305751800537, -0.028157351538538933, -0.012829135172069073, -0.01419509295374155, -0.01177362259477377, -0.0006005168543197215, 0.00942976400256157, 0.023671424016356468, 0.00449757045134902, 0.07307872921228409, -0.01337241381406784, -0.026418861001729965, 0.02089294046163559, 0.07314081490039825, -0.05317921191453934, 0.026030804961919785, -0.01763327047228813, 0.025440959259867668, 0.023640379309654236, 0.025782449170947075, 0.01799028180539608, 0.01785058155655861, -0.03495609387755394, -0.03040807694196701, -0.018921615555882454, -0.0245872363448143, 0.00038369049434550107, -0.01940280571579933, 0.012611824087798595, -0.002881316700950265, 0.06128182262182236, -0.02216576598584652, -0.05007475987076759, -0.0574943944811821, 0.06205793470144272, -0.029802709817886353, 0.06537969410419464, -0.030361510813236237, -0.012573018670082092, 0.0012068544747307897, -0.015436871908605099, 0.00987990852445364, -0.08903559297323227, -0.031975824385881424, -0.05075773969292641, 0.0053396522998809814, 0.06370329111814499, -0.0010361098684370518, 0.016065523028373718, -0.05945019796490669, 0.02041175216436386, 0.013077490963041782, -0.02582901529967785, 0.04116499423980713, -0.002330276882275939, 0.062275245785713196, -0.008630367927253246, -0.03688085451722145, -0.00016601526294834912, 0.03511131927371025, 0.0234385896474123, -0.017198646441102028, -0.013636291958391666, 0.012309139594435692, -0.02204158715903759, 0.005056371446698904, 0.044021084904670715, -0.011874517425894737, -0.033124469220638275, 0.007947389036417007, -0.05665619298815727, 0.02938360907137394, 0.00447040656581521, 0.009072751738131046, 0.00012060056178597733, 
0.06283404678106308, -0.0069539654068648815, 0.010120503604412079, 0.04293452948331833, 0.006305912043899298, 0.01538254413753748, -0.00031893362756818533, -0.02887137606739998, 0.014707326889038086, -0.005331891123205423, -0.027707206085324287, 0.009018423967063427, -0.06569013744592667, -0.008444101549685001, 0.0050912960432469845, -0.015615378506481647, 0.0481189601123333, -0.004012500401586294, 0.0034692217595875263, 0.005413382779806852, 0.01951146125793457, 0.022724566981196404, -0.022771133109927177, 0.006833668332546949, 0.01152526680380106, -0.03843307867646217, 0.004043544642627239, 0.052372053265571594, -0.011098405346274376, 0.01563866063952446, -0.02241412177681923, 0.0691671222448349, -0.0029725099448114634, -0.0004938014317303896, 0.02269352227449417, 0.04880193620920181, 0.028917942196130753, 0.02888689748942852, 0.02495976909995079, 0.007109188009053469, -0.02126547507941723, 0.04709449037909508, 0.04613211005926132, 0.009367674589157104, -0.010912138037383556, 0.014606432057917118, 0.011897800490260124, -0.0310444887727499, -0.015491200610995293, 0.0006310762837529182, 0.056687235832214355, -0.02584453672170639, 0.0069229211658239365, 0.011641683988273144, 0.007877538911998272, 0.03014419972896576, 0.026822438463568687, -0.06848414242267609, 0.011176016181707382, 0.06780116260051727, 0.024866636842489243, -0.008405295200645924, 0.023003967478871346, -0.03492505103349686, -0.012774807401001453, 0.044921375811100006, 0.04597688838839531, -0.05445203557610512, 0.04675300046801567, -0.026667216792702675, -0.014652999117970467, -0.07779748737812042, 0.09654836356639862, 8.112798241199926e-05, 0.010058414191007614, 0.019589072093367577, 0.0024078881833702326, -0.07680406421422958, -0.06432418525218964, 0.010368859395384789, -0.02519260346889496, 0.013900170102715492, 0.018968183547258377, 0.04153752699494362, -0.05622157081961632, -0.049609094858169556, 0.011517505161464214, -0.0516580305993557, 0.004400556441396475, -0.03352804854512215, 
-0.013077490963041782, -0.007640825118869543, 0.033497005701065063, 0.0013271518982946873, 0.0176022257655859, -0.024742458015680313, 0.03168090060353279, -0.0025301259011030197, -0.02291083335876465, 0.016748502850532532, 0.017943715676665306, 0.017447002232074738, 0.019697727635502815, -0.025285735726356506, -0.003731159493327141, -0.025891104713082314, 0.00981782004237175, 0.015467916615307331, -0.02849884144961834, -0.029073163866996765, 0.03408684954047203, -0.00918916892260313, 0.020753242075443268, 0.015941346064209938, 0.012782568112015724, 0.033497005701065063, -0.007120829541236162, 0.010671542957425117, -0.015786122530698776, -0.022895310074090958, -0.006678445730358362, -0.006042033899575472, -0.011424371972680092, -0.048584625124931335, 0.005634574685245752, -0.008933051489293575, 0.08158491551876068, 0.002066398737952113, 0.003925187513232231, -0.02697766199707985, -0.008979618549346924, -0.05659410357475281, 0.024928724393248558, -0.004447123035788536, -0.033776406198740005, 0.0057083056308329105, -0.07444468885660172, 0.008622607216238976, -0.04787060245871544, 0.015110905282199383, 0.02662064880132675, 0.0026290800888091326, 0.01337241381406784, -0.032379403710365295, -0.016546713188290596, 0.022833222523331642, -0.00020955030049663037, 0.027691684663295746, 0.013434503227472305, -0.028405707329511642, -0.021063685417175293, 0.038960833102464676, -0.006926801521331072, -0.05321025475859642, 0.030314944684505463, 0.012115111574530602, -0.009577224962413311, 0.013457786291837692, -0.0005127191543579102, -0.005296966060996056, -0.01569299027323723, 0.0037156373728066683, 0.023112623021006584, 0.04355541989207268, 0.014924637973308563, -0.01623626798391342, -0.011696011759340763, 0.01710551418364048, 0.020861897617578506, -0.06711818277835846, -0.012712717987596989, -0.031370457261800766, -0.0008197685237973928, 0.04790164902806282, -0.014280465431511402, 0.05494874715805054, -0.0022216213401407003, 0.012363468296825886, -0.033124469220638275, 
-0.004031903110444546, -0.009266779758036137, -0.02356276661157608, -0.006457253824919462, 0.031975824385881424, 0.0027998248115181923, -0.057960063219070435, 0.0397990345954895, 0.048832982778549194, 0.014210615307092667, 0.008125894702970982, 0.048708803951740265, 0.016531189903616905, -0.04672195762395859, 0.037315476685762405, -0.05336547642946243, -0.0064805373549461365, -0.023888735100626945, 0.016267312690615654, 0.04305870831012726, 0.0029220625292509794, -0.012642867863178253, 0.00836648978292942, 0.007675750181078911, -0.04141334816813469, 0.024137090891599655, 0.05246518552303314, -0.0491744726896286, -0.018409382551908493, -0.012037500739097595, 0.0013426741352304816, -0.0334349162876606, -0.00942976400256157, -0.029585398733615875, -0.028591975569725037, 0.017276259139180183, -0.008087089285254478, 0.02700870670378208, -0.009026185609400272, 0.02142069861292839, 0.029073163866996765, 0.005692783277481794, 0.005727708339691162, -0.005991586484014988, -0.01735386997461319, 0.015025532804429531, 0.046535689383745193, -0.020318618044257164, -0.053023986518383026, -0.019930562004446983, 0.01722969114780426, -0.018673259764909744, 0.04066828265786171, 0.03877456858754158, -0.010966465808451176, 0.0034342966973781586, -0.015118665993213654, -0.011509744450449944, -0.010221398435533047, 0.001973265316337347, -0.05019893869757652, 0.011843472719192505, -0.045914798974990845, 0.018828483298420906, 0.12094932794570923, 0.012829135172069073, 0.00651934277266264, -0.0026717663276940584, -0.013341369107365608, -0.0006684266845695674, 0.005766513757407665, 0.03480087220668793, 0.004373392555862665, -0.010850048623979092, 0.007058740593492985, 0.031975824385881424, 0.019061315804719925, 0.006181734148412943, 0.051875341683626175, -0.0167174581438303, -0.05755648389458656, 0.002741616452112794, -0.009336629882454872, -0.048708803951740265, -0.006111884023994207, 0.022600388154387474, 0.008902007713913918, -0.053893234580755234, -0.00500592403113842, 
0.011672727763652802, -0.01876639388501644, -0.005382338538765907, 0.009026185609400272, 0.01348106935620308, 0.016810590401291847, 0.013061968609690666, -0.01482374407351017, -0.008156939409673214, 0.002126547507941723, 0.05569381266832352, -0.059481240808963776, 0.006375762168318033, 0.02342306636273861, -0.03306237980723381, 0.00924349669367075, -0.024028435349464417, 0.012014217674732208, 0.0014319270849227905, 0.012301378883421421, -0.0006228300626389682, -0.043524373322725296, -0.0380294993519783, -0.018331771716475487, 0.041599616408348083, -0.001676402403973043, -0.02607737109065056, 0.01799028180539608, 0.012270334176719189, 0.0029201223514974117, 0.023904256522655487, 0.035514894872903824, 0.04054410383105278, 0.03703607618808746, 0.008731262758374214, 0.00449757045134902, 0.020598018541932106, 0.020551452413201332, 0.017012380063533783, -0.04088559374213219, -0.02239859849214554, 0.01551448367536068, -0.005638455506414175, 0.0521857850253582, -0.020458318293094635, -0.04787060245871544, -0.05246518552303314, 0.004260856192559004, -0.015824928879737854, -0.004769209772348404, -0.012340184301137924, 0.011959889903664589, -0.04951595887541771, -0.014218376018106937, 0.03632205352187157, 0.02660512737929821, 0.01736939139664173, 0.04305870831012726, 0.006631879135966301, -0.03585638478398323, 0.019356239587068558, -0.0008086119196377695, -0.04231363907456398, -0.03154120221734047, 0.022972922772169113, -0.05569381266832352, -0.02697766199707985, 0.0037660845555365086, -0.01557657215744257, -0.0074817221611738205, 0.04585270956158638, 0.0016754323150962591, 0.0075632138177752495, -0.019682206213474274, -0.025440959259867668, -0.047560159116983414, 0.017835058271884918, 0.02556513622403145, 0.028079740703105927, 0.004656673409044743, 0.015592094510793686, 0.053520698100328445, 0.03511131927371025, 0.0193717610090971, -0.04206528142094612, 0.02649647183716297, 0.004540256690233946, 0.008964096195995808, -0.003585638478398323, -0.004792492836713791, 
0.004532495513558388, 0.025301259011030197, -0.0043695117346942425, -0.005754872225224972, 0.004423839971423149, 0.029212864115834236, 0.004326825495809317, 0.0054871137253940105, -0.004004739224910736, -0.022460687905550003, 0.02660512737929821, 0.0014164048479869962, 0.05488665774464607, -0.05445203557610512, -0.018067892640829086, 0.006173972971737385, -0.015196277759969234, 0.02256934344768524, 0.019697727635502815, -0.04116499423980713, -0.028654063120484352, 0.03846412152051926, -0.0002391395828453824, 0.0037990694399923086, -0.023655900731682777, -0.026636172086000443, -0.0045984648168087006, 0.0231747105717659, -0.019604595378041267, -0.057711705565452576, 0.0036593691911548376, 0.047342848032712936, 0.006589192897081375, 0.18341083824634552, -0.005386218894273043, 0.017648791894316673, -0.003496385645121336, 0.03177403658628464, -0.04600793495774269, 0.015739556401968002, -0.009422002360224724, -0.07587273418903351, 0.023003967478871346, -0.0008867082069627941, 0.01724521443247795, 0.034769829362630844, -0.03064091131091118, -0.04169274866580963, -0.012029740028083324, -0.007182918954640627, -0.029926888644695282, 0.013473308645188808, 0.005630694329738617, 0.005766513757407665, 0.010966465808451176, 0.0033799686934798956, 0.009064991027116776, -0.012394512072205544, -0.008839918300509453, -0.009041707962751389, 0.007171276956796646, 0.022522777318954468, 0.03545280545949936, -0.015367022715508938, 0.014047631062567234, -0.02267799898982048, -0.02064458467066288, 0.002371022943407297, -0.03200686722993851, -0.007085904479026794, -0.018533559516072273, -0.10505454987287521, -0.05358278751373291, 0.029600920155644417, -0.011626161634922028, 0.017198646441102028, 0.027179449796676636, 0.005304727237671614, -0.04824313521385193, 0.03421102836728096, 0.004505331628024578, -0.004454884212464094, -0.007330379914492369, 0.053769055753946304, -0.0035119077656418085, 0.008777829818427563, 0.013527636416256428, 0.04222050681710243, 0.021948454901576042, 
0.03905396908521652, -0.0013087192783132195, -0.008063806220889091, -0.006662923377007246, -0.01924758404493332, -0.017136558890342712, -0.03002002090215683, 0.03383849188685417, -0.033248648047447205, -0.069353386759758, -0.0321931354701519, -0.05867408588528633, 0.0006640610517933965, 0.004202647600322962, -0.006810384802520275, 0.00040163806988857687, -0.005937258712947369, -0.018471471965312958, -0.01424942072480917, 0.002320575527846813, 0.00032063137041404843, 0.018269682303071022, -0.039457544684410095, -0.044921375811100006, -0.01114497147500515, -0.016934769228100777, 0.009926475584506989, 0.0012737942161038518, -0.03380744904279709, -0.007105307653546333, 0.0034168341662734747, 0.005754872225224972, -0.032876115292310715, 0.032224178314208984, -0.035049229860305786, 0.014024347998201847, -0.008630367927253246, -0.0034362368751317263, 0.0029938530642539263, 0.06395164877176285, -0.027396762743592262, -0.06960174441337585, -0.0022759491112083197, -0.029585398733615875, -0.0352044515311718, -0.02988032065331936, 0.023531723767518997, 0.0015667765401303768, 0.030749566853046417, -0.03880561143159866, 0.03126180171966553, -0.04622524604201317, -0.006080839317291975, -0.01114497147500515, 0.01127691101282835, 0.022103676572442055, 0.011548549868166447, -0.00697336858138442, -0.012627346441149712, -0.01861117221415043, -0.02202606573700905, -0.031851645559072495, -0.038960833102464676, 0.03152567893266678, -0.005219354759901762, -0.03619787469506264, 0.032131046056747437, 0.006884115748107433, 0.003993097227066755, -0.007213963195681572, -0.005906214006245136, 0.01710551418364048, -0.04498346522450447, 0.00904946867376566, 0.0017743866192176938, 0.03660145401954651, -0.030051065608859062, -0.0002558745036367327, 0.002541767666116357, 0.0014862549724057317, 0.047560159116983414, 0.02393530122935772, -0.06798743456602097, 0.02559618093073368, 0.020318618044257164, -3.5046319680986926e-05, 0.02710183896124363, 0.014412404038012028, -0.016546713188290596, 
0.05013684928417206, 0.014652999117970467, -0.03334178030490875, -0.030252855271100998, 0.0029395250603556633, -0.0018762513063848019, -0.029181819409132004, -0.014404643326997757, 0.022724566981196404, 0.009010663256049156, -0.0027804221026599407, 0.0580221489071846, 0.018223116174340248, -0.026123937219381332, 0.049329694360494614, -0.021731141954660416, -0.010252442210912704, -0.04495242238044739, -0.004897268023341894, -0.05144071951508522, -0.01026796456426382, -0.00500592403113842, 0.020613541826605797, 0.008886485360562801, 0.025891104713082314, 0.025642748922109604, -0.003936829511076212, -0.0027687803376466036, -0.043648552149534225, 0.013217191211879253, 0.008832157589495182, 0.023919779807329178, 0.008933051489293575, -0.03430416062474251, 0.0002781877410598099, 0.004924431908875704, -0.003952351398766041, 0.0033217603340744972, -0.016018956899642944, 0.004439361859112978, 0.04575957730412483, 0.034894004464149475, 0.014288226142525673, -0.003488624468445778, 0.025658270344138145, -0.020489362999796867, 0.01444344874471426, -0.020039217546582222, -0.05373801290988922, 0.032348357141017914, -0.0034129535779356956, -0.010609454475343227, -0.014140765182673931, -0.006550387479364872, -0.019915040582418442, 0.03216209262609482, -0.03734651952981949, 0.004943835083395243, -0.00987990852445364, -0.0007824181229807436, 0.021249953657388687, 0.0029647487681359053, -0.0036904136650264263, -0.024292312562465668, -0.005882930941879749, -0.03570116311311722, -0.030563300475478172, 0.009740208275616169, -0.009422002360224724, -0.031137622892856598, -0.02913525328040123, 0.01696581393480301, 0.02011682838201523, -0.03877456858754158, -0.019682206213474274, 0.027831384912133217, 0.015739556401968002, -0.0016327460762113333, 0.0380294993519783, 0.01323271356523037, 0.02686900645494461, -0.030594345182180405, -0.006767698563635349, 0.015421350486576557, -0.06308240443468094, -0.022088155150413513, -0.060971375554800034, -0.04153752699494362, -0.040854547172784805, 
-0.02190188691020012, -0.0003567691019270569, -0.0029220625292509794, -0.0021963976323604584, 0.05994690954685211, 0.028328096494078636, 0.01763327047228813, -0.03837098926305771, -0.01507986057549715, 0.05584903806447983, 0.00011308198008919135, 0.027551984414458275, -0.020861897617578506, 0.05221683159470558, 0.01102855522185564, -0.04830522462725639, -0.006146809086203575, 0.012146156281232834, 0.002142069861292839, -0.047839559614658356, 0.0666835606098175, -0.03380744904279709, 0.04076141491532326, 0.06317553669214249, -0.047560159116983414, 0.004338467493653297, -0.007780525367707014, 0.009507374837994576, -0.06550387293100357, 0.021234430372714996, -0.011385566554963589, -0.019449371844530106, -0.02586006000638008, 0.021327564492821693, -0.010679304599761963, -0.02354724518954754, 0.035018183290958405, -0.03812263160943985, -0.02573588117957115, 0.008475145325064659, -0.02281769923865795, -0.019294150173664093, -0.05712186172604561, 0.04054410383105278, -0.024369923397898674, 0.026837961748242378]
1. Markdown文档中的表格检索方式
1. 数据解析 (Parsing Phase) 目标:把“混沌”的 Markdown 文档拆解成“条理分明”的结构化节点。
-
输入:原始 Markdown 文件(包含文本、标题、表格)。
-
工具:MarkdownElementNodeParser。
-
动作:
-
扫描:扫描全文,识别 |---|---| 格式的表格。
-
提取表格 (Object Extraction):
-
将识别到的表格从原文中“抠”出来,转换成 TableNode(本质是 DataFrame 结构,或者保留原始 Markdown 表格文本)。
-
这部分内容不直接切片进入向量库,因为它包含了复杂的结构,直接切片会破坏语义。
-
-
生成摘要 (Summary Generation):
-
关键一步:Parser 会调用 LLM(如 GPT-3.5),给每个提取出来的表格生成一段自然语言摘要(Summary)。
-
例子:LLM 生成“这是一个关于公司各部门2023年Q3财务数据的表格,包含研发、销售部的收入和成本。”
-
-
构建基座节点 (Base Nodes):
-
将剩余的普通正文切片成 TextNode。
-
创建一个 IndexNode,内容是上面的表格摘要,并带有一个指针(index_id)指向那个完整的 TableNode。
-
-
import os
import pandas as pd
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import TextNode, IndexNode  # needed for filtering below

# 0. Test fixture: a Markdown document containing a standards-compliant table.
markdown_content = """
# 公司季度财报
以下是我们在2023年第三季度的详细财务表现:
| 部门 | 收入(万元) | 成本(万元) | 利润(万元) |
|------|------------|------------|------------|
| 研发部 | 0 | 500 | -500 |
| 销售部 | 2000 | 800 | 1200 |
| 产品部 | 500 | 200 | 300 |
总结来看,销售部表现最为亮眼。
"""
doc = Document(text=markdown_content)

# 1. Build the parser; the LLM is used to write a natural-language summary per table.
node_parser = MarkdownElementNodeParser(
    llm=Settings.llm,
    num_workers=1,  # number of parallel parse workers
)

# 2. Two-step parse: raw nodes first, then split into base nodes + table objects.
raw_nodes = node_parser.get_nodes_from_documents([doc])
base_nodes, objects = node_parser.get_nodes_and_objects(raw_nodes)
print(f"解析完成: 生成了 {len(base_nodes)} 个基座节点, {len(objects)} 个表格对象")

# 3. Inspect the extracted table objects.
print(f"\nobjects 类型: {type(objects)}")
if not objects:
    # Nothing extracted — dump the raw nodes to see what the parser produced.
    print("\n⚠️ 未提取到表格,调试信息:")
    for idx, node in enumerate(raw_nodes):
        print(f" 节点 {idx}: {type(node).__name__} - {node.text[:60]}...")
else:
    print("\n✅ 表格对象提取成功!")
    for idx, obj in enumerate(objects):
        print(f" 对象 {idx}:")
        print(f" 类型: {type(obj).__name__}")
        print(f" ID: {getattr(obj, 'node_id', 'N/A')}")
        # Show a preview when the node carries text/metadata.
        if hasattr(obj, 'text'):
            print(f" 内容预览: {obj.text[:100]}...")
        if hasattr(obj, 'metadata'):
            print(f" 元数据: {obj.metadata}")
1it [00:00, 15141.89it/s]
解析完成: 生成了 2 个基座节点, 1 个表格对象
objects 类型: <class 'list'>
✅ 表格对象提取成功! 对象 0: 类型: IndexNode ID: 4a1bfe0f-4e79-4652-b8a1-3673250d0a1b 内容预览: This table summarizes the financial performance of different departments, showing their income, cost... 元数据: {'col_schema': ''}
- 检索与生成 (Retrieval & Generation Phase) 目标:用户提问,系统精准找到表格并回答。
-
工具:RecursiveRetriever (递归检索器)。
-
流程:
-
第一级检索 (Level-1 Retrieval):
-
用户问:“销售部门去年的利润怎么样?”
-
检索器在 Vector Store 中搜索。
-
由于表格摘要里有“...销售部的收入和成本...”,语义匹配度很高,检索器找回了这个 IndexNode(表格摘要)。
-
-
递归解析 (Recursive Step):
-
RecursiveRetriever 拿到 IndexNode 后,发现它手里捏着一个 index_id。
-
它根据这个 ID 去 DocStore (objects) 里抓取完整的表格对象。
-
现在,检索器手里拿到了完整的 Markdown 表格或 DataFrame。
-
-
上下文组装 (Synthesis):
-
如果检索到了其他普通 TextNode(正文),也一并带上。
-
最终的 Prompt 变成了:
[上下文]
1. (正文片段) 公司Q3战略重点是...
2. (表格内容) | 部门 | 收入 | ... | 销售部 | 2000 | ...
[问题]
销售部门去年的利润怎么样?
-
-
LLM 生成:
- LLM 看到完整的表格数据,轻松计算或提取出“1200万元”,生成最终回答。
# 4. Assemble the node set to index.
#
# BUG FIX: the original built this list from `raw_nodes`, which re-included
# the full summary+table TextNodes produced before get_nodes_and_objects()
# split them. That embedded the complete table text directly and the
# recursive summary -> table dereference never fired. The correct set is the
# plain-text base nodes PLUS the table objects (IndexNodes whose text is the
# LLM-generated summary), so only summaries are embedded and the full table
# is fetched on demand via its index_id pointer.
base_nodes_filtered = base_nodes + objects
print(f"\n过滤后基座节点数: {len(base_nodes_filtered)}")

# 5. Vector index over text chunks + table summaries.
print("\n开始建立向量索引...")
vector_index = VectorStoreIndex(nodes=base_nodes_filtered)
vector_retriever = vector_index.as_retriever(similarity_top_k=2)

# 6. Build the recursive retriever.
# When a retrieved hit is an IndexNode, RecursiveRetriever resolves its
# `index_id` — so the mapping must be keyed by index_id, not node_id.
# Prefer the wrapped table node (`.obj`) when the parser attached one;
# fall back to the IndexNode itself (its text carries summary + table).
objects_dict = {}
for obj in objects:
    objects_dict[obj.index_id] = getattr(obj, "obj", None) or obj
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever},
    node_dict=objects_dict,  # table-object lookup for the recursive step
    verbose=True  # log each recursive hop
)
print("\n✅ 递归检索器构建完成!")

# 7. Retrieval smoke test.
print("\n" + "="*50)
print("开始检索测试")
print("="*50)
query = "销售部的利润是多少?"
print(f"\n查询: {query}")
retrieved_nodes = recursive_retriever.retrieve(query)
print(f"\n检索到 {len(retrieved_nodes)} 个结果:")
for i, node_with_score in enumerate(retrieved_nodes):
    print(f"\n--- 结果 {i+1} (相关度: {node_with_score.score:.3f}) ---")
    print(f"节点类型: {type(node_with_score.node).__name__}")
    print(f"内容:\n{node_with_score.node.text[:300]}...")

# 8. End-to-end: wrap the retriever in a query engine and let the LLM synthesize.
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Settings

query_engine = RetrieverQueryEngine.from_args(
    recursive_retriever,
    llm=Settings.llm
)
print("\n" + "="*50)
print("生成最终答案")
print("="*50)
response = query_engine.query(query)
print(f"\n问题: {query}")
print(f"\n答案: {response}")
print(f"\n来源节点数: {len(response.source_nodes)}")
过滤后基座节点数: 4
开始建立向量索引...
✅ 递归检索器构建完成!
================================================== 开始检索测试
查询: 销售部的利润是多少? [1;3;34mRetrieving with query id None: 销售部的利润是多少? [0m[1;3;38;5;200mRetrieving text node: This table summarizes the financial performance of different departments, showing their income, costs, and profits.
Table Title/Caption: Not provided in the context. Table ID: Not provided in the context. Should the table be kept: Yes, as it provides valuable financial information., with the following columns:
| 部门 | 收入(万元) | 成本(万元) | 利润(万元) |
|---|---|---|---|
| 研发部 | 0 | 500 | -500 |
| 销售部 | 2000 | 800 | 1200 |
| 产品部 | 500 | 200 | 300 |
[0m[1;3;38;5;200mRetrieving text node: 总结来看,销售部表现最为亮眼。 [0m 检索到 2 个结果:
--- 结果 1 (相关度: 0.599) --- 节点类型: TextNode 内容: This table summarizes the financial performance of different departments, showing their income, costs, and profits.
Table Title/Caption: Not provided in the context. Table ID: Not provided in the context. Should the table be kept: Yes, as it provides valuable financial information., with the follo...
--- 结果 2 (相关度: 0.560) --- 节点类型: TextNode 内容: 总结来看,销售部表现最为亮眼。...
================================================== 生成最终答案
[1;3;34mRetrieving with query id None: 销售部的利润是多少? [0m[1;3;38;5;200mRetrieving text node: This table summarizes the financial performance of different departments, showing their income, costs, and profits.
Table Title/Caption: Not provided in the context. Table ID: Not provided in the context. Should the table be kept: Yes, as it provides valuable financial information., with the following columns:
| 部门 | 收入(万元) | 成本(万元) | 利润(万元) |
|---|---|---|---|
| 研发部 | 0 | 500 | -500 |
| 销售部 | 2000 | 800 | 1200 |
| 产品部 | 500 | 200 | 300 |
[0m[1;3;38;5;200mRetrieving text node: 总结来看,销售部表现最为亮眼。 [0m 问题: 销售部的利润是多少?
答案: 销售部的利润是1200万元。
来源节点数: 2
2. PDF文档中表格解析检索
-
核心架构:LlamaParse + MarkdownElementNodeParser
-
实现思路
-
PDF → Markdown 转换:使用 LlamaParse(LlamaIndex 官方的 PDF 解析工具)将 PDF 转换为结构化的 Markdown 格式,表格会被精准保留为 Markdown 表格
-
Markdown → 节点拆分:使用 MarkdownElementNodeParser 提取表格并生成摘要
-
递归检索:与之前 Markdown 案例完全一致
-
重要说明
-
LlamaParse API Key 需要在 https://cloud.llamaindex.ai 注册账号 免费版有每日解析页数限制(通常1000页/天) 将 API Key 替换代码中的 "llx-..."
-
成本优化 LlamaParse 是付费服务,如果文档较大建议先测试几页 可以通过 pages=[1, 2, 3] 参数只解析指定页
-
备选方案 (如果不想用 LlamaParse) 使用开源工具如 pdfplumber 或 camelot 提取表格,然后手工转换为 Markdown 但这种方式对复杂表格(跨行跨列)的处理效果通常不如 LlamaParse


import os
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.schema import TextNode, IndexNode
from llama_parse import LlamaParse
from dotenv import load_dotenv

load_dotenv(override=True)

# ========== Step 0: environment setup ==========
# 1. Install dependencies: pip install llama-parse llama-index
# 2. Get an API key at https://cloud.llamaindex.ai (requires a LlamaCloud account)
# 3. Export it as the LLAMA_CLOUD_API_KEY environment variable
# os.environ["LLAMA_CLOUD_API_KEY"] = "llx-..."

# ========== Step 1: parse the PDF with LlamaParse ==========
print("📄 开始解析 PDF...")
parser = LlamaParse(
    result_type="markdown",  # Markdown output preserves table structure
    verbose=True,
    language="ch_sim",       # simplified Chinese
    # Optional: custom parsing instruction for the service
    # parsing_instruction="重点提取财务数据表格,保留表头和数值"
)

pdf_path = "dataset/招股意向书.pdf"

# Parse with explicit error handling so failures print actionable diagnostics.
try:
    documents = parser.load_data(pdf_path)
    print(f"✅ PDF 解析完成,共生成 {len(documents)} 个文档片段")
    if documents:
        print(f"预览第一个片段:\n{documents[0].text[:300]}...\n")
    else:
        # Empty result: walk through the usual suspects.
        print("⚠️ 警告: 解析结果为空!")
        print("可能的原因:")
        print("1. PDF 文件路径不正确")
        print("2. LlamaParse API 调用失败")
        print("3. API Key 未正确配置")
        print(f"\n请检查:")
        print(f"- 文件是否存在: {os.path.exists(pdf_path)}")
        print(f"- API Key 是否已设置: {bool(os.getenv('LLAMA_CLOUD_API_KEY'))}")
except Exception as e:
    print(f"❌ PDF 解析失败: {e}")
    print("\n调试信息:")
    print(f"- 文件路径: {pdf_path}")
    print(f"- 文件存在: {os.path.exists(pdf_path)}")
    print(f"- API Key 已设置: {bool(os.getenv('LLAMA_CLOUD_API_KEY'))}")
    raise
📄 开始解析 PDF... Started parsing the file under job_id d1cc9269-2046-487c-9108-1824deabc887 ✅ PDF 解析完成,共生成 2 个文档片段 预览第一个片段: 海尔施生物医药股份有限公司 招股意向书
目前,发行人已有 3 项分子诊断产品取得国家食药总局颁发的医疗器械注册证。
此外,发行人基于常染色体 21 个基因位点、常染色体 27 个基因位点、Y 染色体 27 个基因位点识别技术的 3 项 DNA 法医检测试剂已投放市场,其中前两项已获中国安全技术防范认证中心授予的《中国公共安全产品认证证书》,发行人成为目前获得此项认证的少数几家生产厂商之一。
为进一步延伸产业链,充分利用发行人在体外诊断产品经营领域的客户资源和发展经验,发行人还积极向医学诊断服务领域拓展,通过海尔施医学检验所为各级医疗机构...
# ========== Step 2: extract tables with MarkdownElementNodeParser ==========
# Splits the LlamaParse Markdown output into plain-text base nodes plus
# table objects, using the global LLM to summarize each extracted table.
print("🔍 开始提取表格...")
node_parser = MarkdownElementNodeParser(
llm=Settings.llm, # LLM used to generate the per-table summaries
num_workers=4 # parallel workers for parsing/summarization
)
# First pass: raw nodes straight from the documents.
raw_nodes = node_parser.get_nodes_from_documents(documents)
# Second pass: split raw nodes into base (text + summary) nodes and table objects.
base_nodes, objects = node_parser.get_nodes_and_objects(raw_nodes)
print(f"✅ 提取完成:")
print(f" - 基座节点(文本+摘要): {len(base_nodes)} 个")
print(f" - 表格对象(完整表格): {len(objects)} 个\n")
# 调试:查看提取到的表格
if objects:
print("