
    wi>                        d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ defd	Z e
j                    d
efd            Z e
j                    d             Z e
j                    defd            Z d Z! e
j                    d             Z" e
j        d          d             Z# e	d           G d d                      Z$ G d de          Z%dS )    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                     d S )Nc                 2    |                                  rdndS )N)       r   )r   r   r   )is_int8)lhsTyperhsTypes     o/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/triton/backends/nvidia/compiler.py<lambda>zmin_dot_size.<locals>.<lambda>   s    GOO4E4E$WLL<      r   s    r   min_dot_sizer      s    WWWr   binaryc                    t           j                            d|                                  dd          t           j                            t           j                            t                    d|           g}|D ]}t           j                            |          rt           j        	                    |          rst          j        |dgt          j                  }|Ot          j        d|                    d          t          j        	          }|||                    d
          fc S t%          d|            )NTRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr#   resultversions        r   _path_to_binaryr@      s    	
6666;;
RW__X..v>>E
  1 17>># 	127>>##6#6 	1,c;-?
HYZZZF!)$=v}}W?U?U]_]ijjj&a 0 00000
.f..
/
//r   c                  ~    t          j        t          d          d         dg                              d          } | S )Nptxasr   r$   r&   )r4   r5   r@   r9   )r?   s    r   get_ptxas_versionrC   &   s8    %w'?'?'BK&PQQXXY`aaGNr   returnc                    t          | t                    sJ t          t          |                     d                    \  }}|dk    r|dk     rd|z   S |dk    rdS |dk    rd|z   S |dk    rd	|z   S t          d
| z             )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P   U      F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapintsplitr<   )cuda_versionmajorminors      r   ptx_get_versionrW   ,   s    
 lC(((((sL..s3344LE5{{199:aZZ2{{Ez{{Ez
X[gg
h
hhr   c                 Z    | j         }|!t          d          \  }}t          |          }|S )NrB   )ptx_versionr@   rW   )optionsrY   _rT   s       r   get_ptx_version_from_optionsr\   ?   s4    %K)'22<%l33r   c                 N    t          |           }t          d|          }d| }|S )NS   z+ptx)r\   min)rZ   rY   llvm_ptx_versionfeaturess       r   get_featuresrb   G   s4    .w77K 2{++(&((HOr   c                     t          | d          5 }t          j        |                                                                          cd d d            S # 1 swxY w Y   d S )Nrb)openhashlibsha256read	hexdigest)r.   fs     r   	file_hashrk   U   s    	dD		 4Q~affhh''11334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4s   8AAAT)frozenc                   d   e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   dZ	eed	<   dZ
eed
<   dZeed<   dZee         ed<   dZeed<   dZeed<   dZeed<   dZee         ed<   dZee         ed<   dZeed<   dZee         ed<   dZeed<   dZeed<   dZeed<   dZeed <   dZeed!<   d" Zd# Z dS )$CUDAOptions   	num_warpsr)   num_ctas   
num_stagesr   num_buffers_warp_specnum_consumer_groupsreg_dec_producerreg_inc_consumerNmaxnreg)r)   r)   r)   cluster_dimsrY   Tenable_fp_fusion)fp8e5fp8e4b15supported_fp8_dtypesr   deprecated_fp8_dtypestf32default_dot_input_precision)r   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsFdebugcudabackend_namesanitize_overflowc                    t          t                    j        dz  }| j        i nt	          | j                  }|                    dd           s(t          j        dt          |dz                      |d<   t          
                    | dt          |                                                     | j        dk    r| j        | j        dz
  z  dk    s
J d            d S )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcr   r   r)   znum_warps must be a power of 2)r   r1   parentr   dictr,   r*   getenvrP   object__setattr__tupleitemsrp   )selfdefault_libdirr   s      r   __post_init__zCUDAOptions.__post_init__t   s    h.6 ,4bb$t?O:P:P{D11 	s')y1H#n_pNpJqJq'r'rK$4k6G6G6I6I0J0JKKK~!!t~!9K'LQR&R&R&R/ 'S&RR&R&Rr   c                 v   t          | j                  }t          d t          |d                   D                       |d<   d                    d t          |                                          D                       }t          j        |                    d                    	                                S )Nc              3   >   K   | ]\  }}|t          |          fV  d S N)rk   ).0kvs      r   	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>   s1      (h(htq!!Yq\\):(h(h(h(h(h(hr   r   r[   c                 "    g | ]\  }}| d | S )-r   )r   namevals      r   
<listcomp>z$CUDAOptions.hash.<locals>.<listcomp>   s&    SSSID#4#SSSr   r&   )
r   __dict__r   sortedr/   r   rf   rg   encoderi   )r   	hash_dictkeys      r   hashzCUDAOptions.hash}   s    ''	#((h(hviXeNfGgGg(h(h(h#h#h	- hhSS	@Q@Q9R9RSSSTT~cjj1122<<>>>r   )!__name__
__module____qualname__rp   rR   __annotations__rq   rs   rt   ru   rv   rw   rx   r   ry   r   rY   rz   boolr}   r   rP   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   rn   rn   [   s        IsHcJ!"3"""    cc "GXc]!!!#L%###K!d!!!'<%*<<<(*5:***'----/I %*III*.!4...KE4L#"t"""0 0 0? ? ? ? ?r   rn   c                   "    e Zd Zedefd            Zdeddf fdZdefdZd Z	d Z
deeef         fd	Zd
 Zed             Zed             Zed             Zed             Zed             Zd Z ej                    d             Z xZS )CUDABackendr   c                     | j         dk    S )Nr   )backendr   s    r   supports_targetzCUDABackend.supports_target   s    ~''r   rD   Nc                     t                                          |           |j        | _        t	          | j        t
                    sJ d| _        d S )Ncubin)super__init__arch
capabilityrO   rR   
binary_ext)r   r   	__class__s     r   r   zCUDABackend.__init__   sG        +$/3/////!r   c                    fdt           j                                        D             }d|vrXt          t           j                  }| j        dk    r|                    d           t          t          |                    |d<   d|vr| j        dk    rd|d<   d|vrt          j
        d	d
          d
k    |d<   | j        dk    rdnd|d<   t          di |S )Nc                 *    i | ]}|v ||         S r   r   )r   r   optss     r   
<dictcomp>z-CUDABackend.parse_options.<locals>.<dictcomp>   s%    YYYqqTXyy47yyyr   r}   Y   fp8e4nvr~   Z   )r|   rz   TRITON_DEFAULT_FP_FUSION1i   @r   r   r   )rn   __dataclass_fields__keyssetr}   r   addr   r   r*   r   )r   r   argsr}   s    `  r   parse_optionszCUDABackend.parse_options   s    YYYYK$D$I$I$K$KYYY!--#&{'G#H#H "$$$((333+08L1M1M+N+ND'("$.."$$0>,-T))')y1KS'Q'QUX'XD#$9=B9N9NTU,-""T"""r   c                 r    |j         |j        |j        |j        d         |j        d         |j        d         fS )Nr   r)      )rp   rq   sharedry   )r   metadatas     r   pack_metadatazCUDABackend.pack_metadata   s>    O!!$!!$!!$
 	
r   c                 z    dd l mc mc m} | j        dk    r|j        n|j        t          | j                  d}|S )Nr   rI   )convert_custom_typesr   )	triton.language.extra.cudalanguageextrar   r   convert_custom_float8_sm80convert_custom_float8_sm70r   r   )r   r   codegen_fnss      r   get_codegen_implementationz&CUDABackend.get_codegen_implementation   sc    111111111111 04"/D/DD++$Ji(55
 

 r   c                     ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   s     r   get_module_mapzCUDABackend.get_module_map   s    88888819==r   c                 .    t          j        |           d S r   )r   load_dialects)r   ctxs     r   r   zCUDABackend.load_dialects   s    S!!!!!r   c                    t          j        | j                  }|                                 t          j                            |           t          j                            |           t          j        	                    |           t          j        
                    |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           |                    |            | S r   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_combineadd_canonicalizeradd_reorder_broadcastadd_cseadd_licmadd_symbol_dceadd_loop_unrollrun)modr   optpms       r   	make_ttirzCUDABackend.make_ttir   s    _S[))
!!"%%%..r222###''+++))"---b!!!r"""$$R(((##B'''
s
r   c                 	   t          j                    }|j        6|j        d         |_        |j        d         |_        |j        d         |_        t          j                            dd          dk    rGt          j
                    }t          j        || j                  }| j                            d           t          j        | j                  }|                                 t"          j                            |d| |j        d	|j                   t"          j                            |           |d
z  dk    rt"          j                            |           t           j        j                            ||           t"          j                            |           t"          j                            |           t"          j                            |           t"          j                            |           t"          j                            ||dk               t"          j                             |           |d
z  dk    r.t"          j        !                    |           t"          j        "                    |           t"          j        #                    ||j$                   t"          j        %                    ||j$                   t"          j        &                    ||j$                   t"          j        '                    ||j(        |j$        |j)        |j*                   t"          j        +                    ||j,                   t"          j        -                    ||j$                   t"          j        .                    |           t"          j                            ||dk               t"          j                            |           t"          j        /                    |           t"          j        0                    |           t"          j                             |           t"          j        1                    |           |d
z  dk    rHt           j        j        2                    |           t           j        j        3                    |           t"          j        4                    |           |5                    |            |j        |j        |j        f|d<   | S )Nr   r)   r   MLIR_ENABLE_REMARK0r   Tzcuda:r   rM      rI   	   ry   )6r   ClusterInfory   clusterDimXclusterDimYclusterDimZr*   r+   r,   r   
source_mgrr   source_mgr_diagr   printOpOnDiagnosticr   r   r   r   add_convert_to_ttgpuirrp   rq   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r   add_optimize_accumulator_init add_combine_tensor_select_and_ifadd_ws_task_partitionru   add_taskid_propagateadd_ws_data_partitionadd_ws_code_partitionrt   rv   rw   add_pipeliners   add_ws_loweringadd_prefetchadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringr   r   )r   r   r   r   cluster_infosrcMgrdiagr   s           r   
make_ttgirzCUDABackend.make_ttgir   s   )++''*'7':L$'*'7':L$'*'7':L$:>>.44;;_&&F%fck::DK++D111_S[))
**2/Cz/C/CS]TVX[Xdeee##B'''q  N))"---,,R>>>44R88833B777,,R00044R88800Z25EFFFb!!!q  N88<<<N;;B???N00S5LMMMN//C4KLLLN00S5LMMMN00S5NPSPg141EsG[] ] ]N''CN;;;N**2s/FGGG##B'''00Z25EFFF44R888222666//333b!!!$$R(((q  M#77;;;M#44R888''+++
s$0$<l>VXdXp#q 
r   c                    t          |          }|                     d          }||dxx         |z  cc<   | }t          j        |j                  }|                                 t          j                            dd          dk    rGt          j
                    }t          j        ||j                  }	|j                            d           t          j        j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j        j                            |||           t          j        j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                            dd          dk    rt          j                            |           |                     |           t          j!                     t          j                    }
t          j"        ||
          }|dk    rd	nd
| }tG          |          }d}t          j$        ||||           t          j%        |           |j&        Y|'                                D ]D}|(                                s.|)                                r|*                    |j&                   E|j+        r&d |j+        D             }t          j,        ||           t          j-        |t          j.                   |                     d          |d<   t_          |          }~~
|S )Nz"triton_gpu.num-warp-groups-per-ctarp   r   r   r   TTRITON_DISABLE_LINE_INFOr   sm_90asm_nvptx64-nvidia-cudac                     g | ]\  }}|S r   r   )r   r   r.   s      r   r   z)CUDABackend.make_llir.<locals>.<listcomp>1  s    BBBltTTBBBr   ztriton_gpu.sharedr   )0r\   get_int_attrr   r   r   r   r*   r+   r,   r   r   r   r   r   r   r   %add_decompose_unsupported_conversionsr	  convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryadd_to_llvmirr  add_nvgpu_to_llvmadd_arith_to_llvmirr   r   r   r   llvmiradd_di_scoper   init_targets	to_modulerb   attach_datalayoutset_nvvm_reflect_ftzrx   get_functionsis_declarationis_external_linkageset_nvvm_maxnregr   link_extern_libsoptimize_moduleOPTIMIZE_O3rP   )srcr   rZ   r   rY   num_warp_groupsr   r   r  r  r   llvm_modprocra   tripler   r=   rets                     r   	make_llirzCUDABackend.make_llir   s_   27;; **+OPP&[!!!_4!!!_S[))
:>>.44;;_&&F%fck::DK++D111CCBGGG77;;;$$R(((**2...11"555++B
KHHH11"555**2...''+++b!!!$$R(((:>>4c::cAAM&&r***
s,..>#w//%++xx1Cz1C1C((&xx@@@#H--- ?&++-- 8 8'')) 8a.C.C.E.E 8&&w777 	3BBg.ABBBE!(E222Xt'7888 !--.ABB(mm
r   c           	         t          |          }d}|dk    rdnd| }t          |          }t          j        | |||dg|j        d          }t          j        d|          }	t          |	          dk    sJ |	d	         |d
<   |dz   d|dz   }t          j        dd| |t
          j	                  }t          j        dd|          }t          j                            dd          dk    rt          d           t          |           |S )Nr  r   r  r  znvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r)   r   r   rM   rF   z\.version \d+\.\d+z	.version r'   z,\s*debug|debug,\s*r"   NVPTX_ENABLE_DUMPr   r   z // -----// NVPTX Dump //----- //)r\   rb   r   translate_to_asmrz   r7   findalllensubr:   r*   r+   r,   print)
r5  r   r   r   rY   r9  r8  ra   r:  namess
             r   make_ptxzCUDABackend.make_ptx=  s%   2377&%++xx1Cz1C1C$$#CxBSATVYVjlqrr
FLL5zzQ 8$b;;;r>;;f*,E,E,EsRTR^___f+R55:>>-s33s::4555#JJJ
r   c                 8   t          d          \  }}t          j        ddd          5 }t          j        ddd          5 }|                    |            |                                 |j        dz   }t          j                            d	          rg nd
g}	|j	        rg ndg}
|dk    rdnd}t          j                            dd          dk    rddgng }|g|	|
d|d| | |j        d|}	 t          j        |dd|           t          j                            |j                  rt          j        |j                   t          j                            |j                  rt          j        |j                   n# t          j        $ r}t!          |j                  5 }|                                }d d d            n# 1 swxY w Y   t          j                            |j                  rt          j        |j                   |j        dk    rd}n%|j        dt&          j        z   k    rd}n
d|j         }t+          | d| dd                    |           d           d }~ww xY wt!          |d!          5 }|                                }d d d            n# 1 swxY w Y   t          j                            |          rt          j        |           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |S )"NrB   Fwz.ptx)deletemodesuffixrz.logz.or  z	-lineinfoz--fmad=falser   ar"   DISABLE_PTXAS_OPTr   r   z--opt-levelz-vz--gpu-name=sm_z-oT)check	close_fdsr%      z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command:  
rd   )r@   tempfileNamedTemporaryFilewriteflushr   r*   r+   r,   rz   r4   r   r.   r2   removeCalledProcessErrorre   rh   
returncodesignalSIGSEGVr<   r/   )r5  r   r   r   rB   r[   fsrcflogfbin	line_infofmadrI  	opt_level	ptxas_cmdelog_filelogerrorrj   r   s                       r   
make_cubinzCUDABackend.make_cubinS  s>   "7++q(COOO '	 SW'u3vNNN'	 RVJJsOOOJJLLL9t#D jnn-GHH[{mI-C22N3CD&",,SS"F02
?RTW0X0X\_0_0_,,egI!$(*.1:<aZ<aY_<a<acgclnrtxINydSSSS7>>$),, )Idi(((7>>$),, )Idi(((0 N N N$)__ *"--//C* * * * * * * * * * * * * * *7>>$),, )Idi(((<3&&?EE\S6>%9994EELalLLE"e $M $M7:$M $M58XXi5H5H$M $M $M N N NN" dD!! !Q! ! ! ! ! ! ! ! ! ! ! ! ! ! !w~~d##  	$O'	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	  '	 P s   LB5K88BF
K8I:I5.G	I5GI5GBI55I::K8J."K8.J22K85J266K8,L8K<	<L?K<	 LLLc                 d      fd|d<    fd|d<    fd|d<    fd|d<    fd	|d
<   d S )Nc                 2                         | |          S r   )r   r5  r   rZ   r   s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8W/U/U r   r   c                 >                         | |j                  S r   )r  r   rj  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>      XwX\Xg0h0h r   ttgirc                 >                         | |j                  S r   )r;  r   rj  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8WVZVe/f/f r   llirc                 >                         | |j                  S r   )rD  r   rj  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  s    dmmC7TXTc.d.d r   ptxc                 >                         | |j                  S r   )rg  r   rj  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  rl  r   r   r   )r   stagesrZ   s   ` `r   
add_stageszCUDABackend.add_stages  sq    UUUUUvhhhhhwfffffvddddduhhhhhwr   c                 6    t                      }| d| j         S )Nr   )rC   r   )r   r?   s     r   r   zCUDABackend.hash  s#    #%%--DO---r   )r   r   r   staticmethodr   r   r   r
   r   r   r   r   rP   r   r   r   r   r  r;  rD  rg  rt  	functools	lru_cacher   __classcell__)r   s   @r   r   r      s       (	 ( ( ( \("y "T " " " " " "#S # # # #"
 
 
  >S*_ 5 > > > >" " "   \ 2 2 \2h : : \:x   \* * * \*Xi i i Y. . . . . . .r   r   )&triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   dataclassesr	   rw  typingr
   r   r   r   typesr   rf   r7   rS  rZ  r*   r4   pathlibr   r   rx  rP   r@   rC   rR   rW   r\   rb   rk   rn   r   r   r   r   <module>r     sr   ; ; ; ; ; ; ; ; 8 8 8 8 8 8 8 8 8 8 8 8 ! ! ! ! ! !     - - - - - - - - - - - -        				   				          X X X X X 0C 0 0 0 0    
 iS i i i i$   
 
 
 T4 4 4
 $%? %? %? %? %? %? %? %?PF. F. F. F. F.+ F. F. F. F. F.r   